@inProceedings{berdicevskis-etal-2023-superlim-331445, title = {Superlim: A Swedish Language Understanding Evaluation Benchmark}, booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, December 6-10, 2023, Singapore / Houda Bouamor, Juan Pino, Kalika Bali (Editors)}, author = {Berdicevskis, Aleksandrs and Bouma, Gerlof and Kurtz, Robin and Morger, Felix and Öhman, Joey and Adesam, Yvonne and Borin, Lars and Dannélls, Dana and Forsberg, Markus and Isbister, Tim and Lindahl, Anna and Malmsten, Martin and Rekathati, Faton and Sahlgren, Magnus and Volodina, Elena and Börjeson, Love and Hengchen, Simon and Tahmasebi, Nina}, year = {2023}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {979-8-89176-060-8}, pages = {8137--8153}, } @inProceedings{fridlund-etal-2020-trawling-299694, title = {Trawling the Gulf of Bothnia of News: A Big Data Analysis of the Emergence of Terrorism in Swedish and Finnish Newspapers, 1780–1926}, abstract = {This study combines history domain knowledge and language technology expertise to evaluate and expand on research claims regarding the historical meanings associated with terrorism in Swedish and Finnish contexts. Using a cross-border comparative approach and large newspaper corpora made available by the CLARIN research infrastructure, we explore overlapping national discourses on terrorism, the concept’s historical diversity and its relations to different national contexts. We are particularly interested in testing the hypothesis that substate terrorism’s modern meaning was not yet established in the 19th century but primarily restricted to Russian terrorism. We conclude that our comparative study finds both uniquely national and shared meanings of terrorism and that our study strengthen the hypothesis. 
In extension, the study also serves as an exploration of the potentials of cross-disciplinary evaluative studies based on extensive corpora and of cross-border comparative approaches to Swedish and Finnish newspaper corpora.}, booktitle = {CLARIN Annual Conference Proceedings 2020. Edited by Costanza Navarretta, Maria Eskevich, 05–07 October 2020, Virtual Edition}, author = {Fridlund, Mats and Olsson, Leif-Jöran and Brodén, Daniel and Borin, Lars}, year = {2020}, publisher = {CLARIN}, } @book{borin-etal-2024-vaccine-341185, title = {Vaccine Hesitancy in the Nordic Countries: Trust and Distrust During the COVID-19 Pandemic}, abstract = {Bringing together studies from across the Nordic region, this book examines the challenges brought by the COVID-19 pandemic, with a particular focus on vaccine hesitancy. Shedding light on the political tensions that emerged as a result of the pandemic and the debates that ensued both within and between the Nordic nations, it investigates the vociferous discussions surrounding the COVID-19 vaccines and their presumed negative side effects through the lens of trust; trust in and between the neighbouring countries, in healthcare systems, fellow citizens, and experts; in public authorities, politicians, researchers, journalists, and pharmaceutical companies. The first volume to explore vaccine hesitancy in the Scandinavian context, this ground-breaking volume offers fresh perspectives on vaccine scepticism not as a form of ignorance or lack of knowledge, but as a manifestation of a more fundamental lack of faith in modern government and science. As such, it will appeal to scholars of sociology, politics, anthropology, media studies, communication and cultural studies with interests in public health, popular and political discourse and questions of public trust. 
}, author = {Borin, Lars and Hammarlin, Mia Marie and Kokkinakis, Dimitrios and Miegel, Fredrik}, year = {2024}, publisher = {Taylor and Francis}, ISBN = {9781040011614}, } @inProceedings{borin-holmer-2024-tradita-333774, title = {Tradita innovare, innovata tradere. The Gothenburg approach to computational lexicography}, abstract = {Swedish computational lexicography has a long history at the University of Gothenburg, both in its primary role as a central aspect of the scientific study of vocabulary and also as an infrastructural component for conducting research based on language data. Starting in the 1960s, the Språkdata research group pioneered corpus-supported lexicography for Swedish, forming the basis for successive editions of the two main descriptive dictionaries of contemporary Swedish, SAOL and SO. Language technological lexical resources for Swedish have been developed by the research unit/research infrastructure Språkbanken Text since the turn of the millennium, most recently in the framework of the Swedish FrameNet++initiative. After two decades of separation, these two largely mutually independently developed strands of computational lexicography have now joined forces under the umbrella of Språkbanken’s lexical research infrastructure to advance the field technically, methodologically, and scientifically.}, booktitle = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden / (Eds. 
Elena Volodina, Gerlof Bouma, Markus Forsberg, Dimitrios Kokkinakis, David Alfter, Mats Fridlund, Christian Horn, Lars Ahrenberg, Anna Blåder)}, author = {Borin, Lars and Holmer, Louise}, year = {2024}, publisher = {LiU Electronic Press}, address = {Linköping}, ISBN = {978-91-8075-512-2}, } @incollection{fridlund-etal-2022-trawling-319822, title = {Trawling and Trolling for Terrorists in the Digital Gulf of Bothnia: Cross-lingual Text Mining for the Emergence of Terrorism in Swedish and Finnish Newspapers, 1780–1926}, abstract = {In pursuing the historical emergence of the discourse on terrorism, this study trawls the “digital Gulf of Bothnia” in the form of a corpus of combined Swedish and Finnish digitized newspaper texts. Through a cross-lingual exploration of the uses of the concept of terrorism in historical Swedish and Finnish news, we examine meanings anchored in the two culturally close but still decidedly different national political contexts. The study is an outcome of an integrative interdisciplinary effort by Swe-Clarin, using resources accessible through the CLARIN infrastructure to enrich scholarship in the humanities. The capabilities of the corpus tool Korp enable us to affirm prior research on the conceptual history of terrorism, but also to suggest a complex and diverse picture of the connotations of terrorism, both as state and sub-state violence up until the 20th century. At the same time, the study allows us to explore the potentials of cross-lingual text mining for historical analysis of national online newspaper corpora provided by Swe-Clarin and FIN-CLARIN.}, booktitle = {CLARIN: The Infrastructure for Language Resources, eds. 
Darja Fišer & Andreas Witt}, author = {Fridlund, Mats and Brodén, Daniel and Jauhiainen, Tommi and Malkki, Leena and Olsson, Leif-Jöran and Borin, Lars}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin, Boston}, ISBN = {9783110767346}, pages = {781--802}, } @incollection{borin-etal-2021-introduction-310200, title = {Introduction: Swedish FrameNet++}, abstract = {The Swedish FrameNet++ was designed to be several things. As a digital artifact, it is an integrated panchronic lexical macroresource, primarily for Swedish, but including several other languages, intended as a basic infrastructural component in Swedish language technology research and for developing natural language processing applications. As an activity, it is a long-term R&D initiative, initially aimed at bringing about this macroresource, and now at maintaining and extending it, at promoting its use in language technology research and application development, as well as ensuring that the results of this research and development in their turn are incorporated in the macroresource. As a product of research, it reflects both computational and linguistic approaches to lexicology, lexical semantics, and lexical typology.}, booktitle = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications / editor(s): Dana Dannélls, Lars Borin and Karin Friberg Heppin }, author = {Borin, Lars and Dannélls, Dana and Friberg Heppin, Karin}, year = {2021}, publisher = {John Benjamins Publishing Company}, address = {Amsterdam / Philadelphia}, ISBN = {978 90 272 5848 9}, pages = {3 -- 36}, } @book{dannells-etal-2021-swedish-310036, title = {The Swedish FrameNet++ Harmonization, integration, method development and practical language technology applications}, abstract = {Large computational lexicons are central NLP resources. Swedish FrameNet++ aims to be a versatile full-scale lexical resource for NLP containing many kinds of linguistic information. 
Although focused on Swedish, this ongoing effort, which includes building a new Swedish framenet and recycling existing lexicons, has offered valuable insights into general aspects of lexical-resource building for NLP, which are discussed in this book: computational and linguistic problems of lexical semantics and lexical typology, the nature of lexical items (words and multiword expressions), achieving interoperability among heterogeneous lexical content, NLP methods for extending and interlinking existing lexicons, and deploying the new resource in practical NLP applications. This book is targeted at everyone with an interest in lexicography, computational lexicography, lexical typology, lexical semantics, linguistics, computational linguistics and related fields. We believe it should be of particular interest to those who are or have been involved in language resource creation, development and evaluation.}, author = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin}, year = {2021}, publisher = {John Benjamins Publishing Company}, address = {Amsterdam, Philadelphia}, ISBN = {9789027209900 }, } @incollection{borin-etal-2023-language-337444, title = {Language Report Swedish}, abstract = {Swedish speech and language technology (LT) research goes back over 70 years. This has paid off: there is a national research infrastructure, as well as significant research projects, and Swedish is well-endowed with language resources (LRs) and tools. However, there are gaps that need to be filled, especially high-quality goldstandard LRs required by the most recent deep-learning methods. 
In the future, we would like to see closer collaborations and communication between the “traditional” LT research community and the burgeoning AI field, the establishment of dedicated academic LT training programmes, and national funding for LT research.}, booktitle = {Cognitive Technologies}, author = {Borin, Lars and Domeij, Rickard and Edlund, Jens and Forsberg, Markus}, year = {2023}, pages = {219--222}, } @incollection{virk-etal-2023-lingfn-337386, title = {LingFN: A Framenet for the Linguistic Domain}, abstract = {Frame semantics is a theory of meaning in natural language, which defines the structure of the lexical semantic resources known as framenets. Both framenets and frame semantics have proved useful for a number of natural language processing (NLP) tasks. However, in this connection framenets have often been criticized for their limited coverage. A proposed reasonable-effort solution to this problem is to develop domain-specific (sublanguage) framenets to complement the corresponding general-language framenets for particular NLP tasks, and in the literature we find such initiatives covering domains such as medicine, soccer, and tourism. In this paper, we report on building a framenet to cover the terms and concepts encountered in descriptive linguistic grammars (written in English) i.e. 
a framenet for the linguistic domain (LingFN) to complement the general-language BFN.}, booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, author = {Virk, Shafqat and Klang, Per and Borin, Lars and Saxena, Anju}, year = {2023}, ISBN = {9783031243363}, pages = {367--379}, } @incollection{borin-2021-multiword-311388, title = {Multiword expressions – a tough typological nut for Swedish FrameNet++}, booktitle = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications / editor(s): Dana Dannélls, Lars Borin and Karin Friberg Heppin}, author = {Borin, Lars}, year = {2021}, publisher = {John Benjamins}, address = {Amsterdam}, ISBN = {9789027209900}, pages = {221–259}, } @incollection{borin-etal-2021-swedish-311387, title = {Swedish FrameNet++ and comparative linguistics}, booktitle = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications / editor(s): Dana Dannélls, Lars Borin and Karin Friberg Heppin}, author = {Borin, Lars and Saxena, Anju and Virk, Shafqat and Comrie, Bernard}, year = {2021}, publisher = {John Benjamins}, address = {Amsterdam}, ISBN = {9789027209900}, pages = {139–165}, } @inProceedings{virk-etal-2017-automatic-261789, title = {Automatic extraction of typological linguistic features from descriptive grammars}, abstract = {The present paper describes experiments on automatically extracting typological linguistic features of natural languages from traditional written descriptive grammars. The feature-extraction task has high potential value in typological, genealogical, historical, and other related areas of linguistics that make use of databases of structural features of languages. 
Until now, extraction of such features from grammars has been done manually, which is highly time and labor consuming and becomes prohibitive when extended to the thousands of languages for which linguistic descriptions are available. The system we describe here starts from semantically parsed text over which a set of rules are applied in order to extract feature values. We evaluate the system’s performance on the manually curated Grambank database as the gold standard and report the first measures of precision and recall for this problem.}, booktitle = {Text, Speech, and Dialogue 20th International Conference, TSD 2017, Prague, Czech Republic, August 27-31, 2017, Proceedings}, editor = {Kamil Ekštein and Václav Matoušek.}, author = {Virk, Shafqat and Borin, Lars and Saxena, Anju and Hammarström, Harald}, year = {2017}, publisher = {Springer International Publishing}, address = {Cham}, ISBN = {978-3-319-64205-5}, } @article{lindahl-borin-2024-annotation-333043, title = {Annotation for computational argumentation analysis: Issues and perspectives}, abstract = {Argumentation has long been studied in a number of disciplines, including several branches of linguistics. In recent years, computational processing of argumentation has been added to the list, reflecting a general interest from the field of natural language processing (NLP) in building natural language understanding systems for increasingly intricate language phenomena. Computational argumentation analysis – referred to as argumentation mining in the NLP literature – requires large amounts of real-world text with manually analyzed argumentation. This process is known as annotation in the NLP literature and such annotated datasets are used both as “gold standards” for assessing the quality of NLP applications and as training data for the machine learning algorithms underlying most state of the art approaches to NLP. 
Argumentation annotation turns out to be complex, both because argumentation can be complex in itself and because it does not come across as a unitary phenomenon in the literature. In this survey we review how argumentation has been studied in other fields, how it has been annotated in NLP and what has been achieved so far. We conclude with describing some important current and future issues to be resolved.}, journal = {Language and Linguistics Compass}, author = {Lindahl, Anna and Borin, Lars}, year = {2024}, volume = {18}, number = {1}, } @inProceedings{virk-etal-2021-data-306964, title = {A Data-Driven Semi-Automatic Framenet Development Methodology }, abstract = {FrameNet is a lexical semantic resource based on the linguistic theory of frame semantics. A number of framenet development strategies have been reported previously and all of them involve exploration of corpora and a fair amount of manual work. Despite previous efforts, there does not exist a well-thought-out automatic/semi-automatic methodology for frame construction. In this paper we propose a data-driven methodology for identification and semi-automatic construction of frames. As a proof of concept, we report on our initial attempts to build a wider-scale framenet for the legal domain (LawFN) using the proposed methodology. 
The constructed frames are stored in a lexical database and together with the annotated example sentences they have been made available through a web interface.}, booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021 / Edited by Galia Angelova, Maria Kunilovskaya, Ruslan Mitkov, Ivelina Nikolova-Koleva}, author = {Virk, Shafqat and Dannélls, Dana and Borin, Lars and Forsberg, Markus}, year = {2021}, publisher = {INCOMA}, address = {Shoumen, Bulgaria}, ISBN = {978-954-452-072-4}, } @incollection{saxena-etal-2022-linguistic-317923, title = {A linguistic sketch of Kanashi}, booktitle = {Synchronic and Diachronic Aspects of Kanashi}, editor = {Anju Saxena and Lars Borin}, author = {Saxena, Anju and Borin, Lars and Comrie, Bernard and Sagar, Padam}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, pages = {53--127}, } @incollection{saxena-borin-2022-introduction-317921, title = {Introduction: Kanashi, its speakers, its linguistic and extralinguistic context}, booktitle = {Synchronic and diachronic aspects of Kanashi}, editor = {Anju Saxena and Lars Borin}, author = {Saxena, Anju and Borin, Lars}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, pages = {3--11}, } @incollection{saxena-etal-2022-clues-317928, title = {Clues to Kanashi prehistory 1: Loanword adaptation in nouns and adjectives}, booktitle = {Synchronic and Diachronic Aspects of Kanashi}, editor = {Anju Saxena and Lars Borin}, author = {Saxena, Anju and Borin, Lars and Comrie, Bernard}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, pages = {173--213}, } @incollection{saxena-etal-2022-linguistic-317924, title = {Linguistic variation: A challenge for describing the phonology of Kanashi}, booktitle = {Synchronic and Diachronic Aspects of Kanashi}, editor = {Anju Saxena and Lars Borin}, author = {Saxena, Anju 
and Sjöberg, Anna and Sagar, Padam and Borin, Lars}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, pages = {131--144}, } @incollection{saxena-etal-2022-kanashi-317930, title = {Kanashi and West Himalayish: Genealogy, language contact, prehistoric migrations}, booktitle = {Synchronic and Diachronic Aspects of Kanashi}, editor = {Anju Saxena and Lars Borin}, author = {Saxena, Anju and Borin, Lars and Comrie, Bernard}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, pages = {237--254}, } @edited_book{saxena-borin-2022-synchronic-317920, title = {Synchronic and diachronic aspects of Kanashi}, editor = {Saxena, Anju and Borin, Lars}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, } @incollection{saxena-etal-2022-clues-317929, title = {Clues to Kanashi prehistory 2: Loanword adaptation in verbs}, booktitle = {Synchronic and Diachronic Aspects of Kanashi}, editor = {Anju Saxena and Lars Borin}, author = {Saxena, Anju and Borin, Lars and Comrie, Bernard}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, pages = {215--233}, } @incollection{saxena-borin-2022-then-317927, title = {And then there was one: Kanashi numerals from borrowed superdiversity to borrowed uniformity}, booktitle = {Synchronic and Diachronic Aspects of Kanashi}, editor = {Anju Saxena and Lars Borin}, author = {Saxena, Anju and Borin, Lars}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, pages = {145--170}, } @article{rehm-etal-2016-strategic-237609, title = {The strategic impact of META-NET on the regional, national and international level}, abstract = {This article provides an overview of the dissemination work carried out in META-NET from 2010 until 2015; we describe its impact on the regional, national and international level, mainly with regard to politics and the funding situation 
for LT topics. The article documents the initiative’s work throughout Europe in order to boost progress and innovation in our field.}, journal = {Language resources and evaluation}, author = {Rehm, Georg and Uszkoreit, Hans and Ananiadou, Sophia and Bel, Núria and Bielevičienė, Audronė and Borin, Lars and Branco, António and Budin, Gerhard and Calzolari, Nicoletta and Daelemans, Walter and Garabík, Radovan and Grobelnik, Marko and García-Mateo, Carmen and Genabith, Josef Van and Hajič, Jan and Hernáez, Inma and Judge, John and Koeva, Svetla and Krek, Simon and Krstev, Cvetana and Lindén, Krister and Magnini, Bernardo and Mariani, Joseph and Mcnaught, John and Melero, Maite and Monachini, Monica and Moreno, Asunción and Odijk, Jan and Ogrodniczuk, Maciej and Pęzik, Piotr and Piperidis, Stelios and Przepiórkowski, Adam and Rögnvaldsson, Eiríkur and Rosner, Mike and Pedersen, Bolette Sandford and Skadiņa, Inguna and De Smedt, Koenraad and Tadić, Marko and Thompson, Paul and Tufiş, Dan and Váradi, Tamás and Vasiļjevs, Andrejs and Vider, Kadri and Zabarskaitė, Jolanta}, year = {2016}, volume = {50}, number = {2}, pages = {351--374}, } @article{hammarlin-etal-2023-covid-329784, title = {COVID-19 Vaccine Hesitancy: A Mixed Methods Investigation of Matters of Life and Death.}, abstract = {In this article, hesitancy towards COVID-19 vaccinations is investigated as a phenomenon touching upon existential questions. We argue that it encompasses ideas of illness and health, and also of dying and fear of suffering. Building on a specific strand within anti-vaccination studies, we conjecture that vaccine hesitancy is, to some extent, reasonable, and that this scepticism should be studied with compassion. 
Through a mixed methods approach, vaccine hesitancy, as it is being expressed in a Swedish digital open forum, is investigated and understood as, on the one hand, a perceived need of protecting one’s body from techno-scientific experiments, and thus the risk of becoming a victim of medicine itself. On the other hand, the community members express what we call a tacit belief in modern medicine by demonstrating their own “expert” pandemic knowledge. The analysis also shows how the COVID-19 pandemic triggers memories of another pandemic, namely the swine flu in 2009–2010, and what we term a medical crisis that occurred then, due to a vaccine thatcaused a rare but severe side effect in Sweden and elsewhere.}, journal = {Journal of Digital Social Research (JDSR)}, author = {Hammarlin, MIa-Marie and Kokkinakis, Dimitrios and Borin, Lars}, year = {2023}, volume = {5}, number = {4}, pages = {31--61}, } @inProceedings{ahlberg-etal-2016-karp-246072, title = {Karp: Språkbanken’s Open Lexical Infrastructure}, booktitle = {Globalex 2016, May 24, Portorož, Slovenia}, author = {Ahlberg, Malin and Borin, Lars and Forsberg, Markus and Olsson, Olof and Schumacher, Anne and Uppström, Jonatan}, year = {2016}, } @techreport{hammarstedt-etal-2022-sparv-318399, title = {Sparv 5 Developer’s Guide}, abstract = {The Sparv Pipeline developed by Språkbanken Text is a text analysis tool run from the command line. This Developer’s Guide describes its general structure and key concepts and serves as an API documentation. Most importantly, it describes how to write plugins for Sparv 5 so that you can add your own functions to the toolkit.}, author = {Hammarstedt, Martin and Schumacher, Anne and Borin, Lars and Forsberg, Markus}, year = {2022}, } @incollection{pettersson-borin-2022-swedish-323276, title = {Swedish Diachronic Corpus}, abstract = {The recently compiled Swedish Diachronic Corpus offers access to a total of approximately 16 billion words, covering texts from the 13th century onwards. 
The corpus contains 14 main genres, with a number of subgenres, compiled from a wide range of sources, including corpus providers and libraries as well as individual researchers and private citizens. All texts in the corpus follow a consistent format, are extensively annotated with metadata, and freely available for download. We firmly believe that the existence of a Swedish diachronic corpus among the resources offered by CLARIN will open up avenues to new, interesting research questions within humanities research, and be a valuable resource for large-scale studies of the Swedish language throughout history – studies that have previously been impossible to conduct in a thorough and consistent manner. Thanks to its embedding in the CLARIN context it also carries the potential to enable broad historical studies from a comparative European perspective.}, booktitle = {CLARIN: The infrastructure for language resources}, editor = {Darja Fišer and Andreas Witt}, author = {Pettersson, Eva and Borin, Lars}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {978-3-11-076734-6}, pages = {561–585}, } @inProceedings{ahlberg-etal-2013-korp-178355, title = {Korp and Karp – a bestiary of language resources: the research infrastructure of Språkbanken}, abstract = {A central activity in Språkbanken, an R&D unit at the University of Gothenburg, is the systematic construction of a research infrastructure based on interoperability and widely accepted standards for metadata and data. The two main components of this infrastructure deal with text corpora and with lexical resources. For modularity and flexibility, both components have a backend, or server-side part, accessed through an API made up of a set of well-defined web services. This means that there can be any number of different user interfaces to these components, corresponding, e.g., to different research needs. 
Here, we will demonstrate the standard corpus and lexicon search interfaces, designed primarily for linguistic searches: Korp and Karp.}, booktitle = {Proceedings of the 19th Nordic Conference of Computational Linguistics (NODALIDA 2013), May 22–24, 2013, Oslo University, Norway. NEALT Proceedings Series 16}, author = {Ahlberg, Malin and Borin, Lars and Forsberg, Markus and Hammarstedt, Martin and Olsson, Leif-Jöran and Olsson, Olof and Roxendal, Johan and Uppström, Jonatan}, year = {2013}, publisher = {Linköping University Electronic Press}, address = {Linköping}, } @inProceedings{borin-etal-2013-lexical-186032, title = {The lexical editing system of Karp}, abstract = {Karp is the open lexical infrastructure of Språkbanken (the Swedish Language Bank). The infrastructure has three main functions: (1) to support the work on creating, curating, and integrating our various lexical resources; (2) to publish the resources, making them searchable and downloadable; and (3) to offer advanced editing functionalities. An important feature of the lexical infrastructure is also that we maintain a strong bidirectional connection to our corpus infrastructure. At the heart of the infrastructure is the SweFN++ project with the goal to create free Swedish lexical resources geared towards language technology applications. The infrastructure currently hosts 23 Swedish lexical resources. The resources are integrated through links to a pivot lexical resource, SALDO, a large morphological and lexical-semantic resource for modern Swedish.}, booktitle = {Kosem, I., Kallas, J., Gantar, P., Krek, S., Langemets, M., Tuulik, M. (eds.) 2013. Electronic lexicography in the 21st century: thinking outside the paper. 
Proceedings of the eLex 2013 conference, 17-19 October 2013, Tallinn, Estonia.}, author = {Borin, Lars and Forsberg, Markus and Olsson, Leif-Jöran and Olsson, Olof and Uppström, Jonatan}, year = {2013}, publisher = {Trojina, Institute for Applied Slovene Studies / Eesti Keele Instituut }, address = {Ljubljana/Tallinn}, ISBN = { 978-961-93594-0-2}, } @techreport{hammarstedt-etal-2022-sparv-318405, title = {Sparv 5 User Manual}, abstract = {The Sparv Pipeline developed by Språkbanken Text is a text analysis tool run from the command line. This user manual describes how to get Sparv 5 up and running on your own machine, how to configure it and how to use it for annotating your own corpora.}, author = {Hammarstedt, Martin and Schumacher, Anne and Borin, Lars and Forsberg, Markus}, year = {2022}, publisher = {Institutionen för svenska, flerspråkighet och språkteknologi}, address = {Göteborg}, } @inProceedings{ahlberg-etal-2016-sprakbanken's-246063, title = {Språkbanken’s Open Lexical Infrastructure}, abstract = {Karp is an open lexical infrastructure and a web based tool for searching, exploring and developing lexical resources. Språkbanken currently hosts a number of lexicons in Karp and on-going work aims at broadening the type of resources that can be developed in the system. This abstract gives a short overview of Karp's basic functionality, and describes some current projects and on-going work.}, booktitle = {SLTC 2016. The Sixth Swedish Language Technology Conference. 
Umeå University, 17-18 November, 2016}, author = {Ahlberg, Malin and Borin, Lars and Forsberg, Markus and Olsson, Olof and Schumacher, Anne and Uppström, Jonatan}, year = {2016}, } @incollection{borin-etal-2021-swedish-311385, title = {Swedish FrameNet++ – lexical samsara}, booktitle = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications / editor(s): Dana Dannélls, Lars Borin and Karin Friberg Heppin}, author = {Borin, Lars and Forsberg, Markus and Lönngren, Lennart and Zechner, Niklas}, year = {2021}, publisher = {John Benjamins}, address = {Amsterdam}, ISBN = {9789027209900}, pages = {69–95}, } @article{borin-2022-that-322872, title = {All that glitters . . . : Interannotator agreement in natural language processing}, abstract = {Evaluation has emerged as a central concern in natural language processing (NLP) over the last few decades. Evaluation is done against a gold standard, a manually linguistically annotated dataset, which is assumed to provide the ground truth against which the accuracy of the NLP system can be assessed automatically. In this article, some methodological questions in connection with the creation of gold standard datasets are discussed, in particular (non-)expectations of linguistic expertise in annotators and the interannotator agreement measure standardly but unreflectedly used as a kind of quality index of NLP gold standards.}, journal = {Nordlyd}, author = {Borin, Lars}, year = {2022}, volume = {46}, number = {1}, pages = {19--26}, } @inProceedings{kokkinakis-etal-2022-necessity-321865, title = {The necessity of digital health communication in social media to boost COVID-19 vaccine acceptance. }, booktitle = {ICA Post Conference: Digital Health Communication: Issues and Perspectives. 
University of Burgundy Franche-Comté, Dijon, France.}, author = {Kokkinakis, Dimitrios and Hammarlin, Mia-Marie and Borin, Lars and Miegel, Fredrik}, year = {2022}, } @misc{tahmasebi-etal-2022-proceedings-316661, title = {Proceedings of the 3rd Workshop on Computational Approaches to Historical Language Change, May 26-27, 2022, Dublin, Ireland}, author = {Tahmasebi, Nina and Montariol, Syrielle and Kutuzov, Andrey and Hengchen, Simon and Dubossarsky, Haim and Borin, Lars}, year = {2022}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {978-1-955917-42-1}, } @inProceedings{hammarlin-etal-2021-vaccine-307227, title = {Vaccine hesitancy – trust and distrust in medical expertise and authorities}, abstract = {The increase of vaccine hesitancy is singled out by WHO as one of the ten most important and urgent threats to global health (https://www.who.int/emergencies/ten-threats-to-global-health-in-2019). Diseases like measles are returning in different parts of Europe, partly as a result of the activities of the anti-vaccination movement. The herd immunity in most Western countries is high but even a small decrease in vaccination would have immediate negative effects for the population. Sweden offers a perfect site for future anti-vaccination studies due to its high vaccination covering. A decline in the numbers of children vaccinated has had immediate effects. For example, the incident rate in the country of pertussis rose from 700 cases to 3,200 cases per 100,000 children in 4 years due to a rather small decrease in vaccinations. This constitutes a strong argument for the civic importance of the case. 
The aim of this presentation is to introduce a new 4-year research project (2020–2023), independently financed by the Bank of Sweden Foundation (Riksbankens jubileumsfond), with the goal to investigate the role and importance of rumouring for the vaccination skepticism growing on the internet, and how it can be understood as an expression of civic engagement in the present digital times entailing crucial transformations for everyday civic culture. Theoretically, the project builds upon, and develop, media researcher Dahlgren’s work on civic culture and Kitta’s studies of the anti-vaccination movement. The overarching research question is: How have the everyday practice and experience of, and the conditions for, rumours been shaped and reshaped in the digital age, and what do these processes mean for civic engagement and participation? The project will offer an understanding of how everyday interaction on the internet has a powerful impact on the spreading of false information, which in the long run may challenge democracy. On a more concrete level the project will answer the following questions in relation to the case of vaccine skepticism: How are rumours about alleged risks and dangers of vaccination propagated and established on the internet? Are there specific patterns and correlations connecting topics, assumptions, myths, argumentation schemes, popularity and time? What do everyday practices, on- and offline, of rumouring mean for its adherents’ civic engagement in the anti-vaccination movement? 
Which are the civic implications of the spreading and circulation of vaccination hostile rumours on individual citizens and society at large?},
  booktitle = {8th European Communication Conference (ECREA)},
  author    = {Hammarlin, Mia-Marie and Miegel, Fredrik and Borin, Lars and Kokkinakis, Dimitrios and Jaakonaho, Anna},
  year      = {2021},
}

% Paper in CEUR Workshop Proceedings vol. 2364. Series and volume have their
% own fields so booktitle carries only the conference title.
@inProceedings{rouces-etal-2019-tracking-281308,
  title     = {Tracking Attitudes Towards Immigration in Swedish Media},
  abstract  = {We use a gold standard under construction for sentiment analysis in Swedish to explore how attitudes towards immigration change across time and media. We track the evolution of attitude starting from the year 2000 for three different Swedish media: the national newspapers Aftonbladet and Svenska Dagbladet, representing different halves of the left–right political spectrum, and the online forum Flashback.},
  series    = {CEUR Workshop Proceedings},
  volume    = {2364},
  booktitle = {Digital Humanities in the Nordic Countries 4th Conference, Copenhagen, Denmark, March 5-8, 2019},
  author    = {Rouces, Jacobo and Borin, Lars and Tahmasebi, Nina},
  year      = {2019},
  publisher = {CEUR-WS},
  address   = {Aachen},
}

% Second paper from the same CEUR volume (vol. 2364).
@inProceedings{rouces-etal-2019-political-281307,
  title     = {Political Stance Analysis Using Swedish Parliamentary Data},
  abstract  = {We process and visualize Swedish parliamentary data using methods from statistics and machine learning, which allows us to obtain insight into the political processes behind the data. We produce plots that let us infer the relative stance of political parties and their members on different topics. In addition, we can infer the degree of homogeneity of individual votes within different parties, as well as the degree of multi-dimensionality of Swedish politics.},
  series    = {CEUR Workshop Proceedings},
  volume    = {2364},
  booktitle = {
Digital Humanities in the Nordic Countries 4th Conference, Copenhagen, Denmark, March 5-8, 2019.},
  author    = {Rouces, Jacobo and Borin, Lars and Tahmasebi, Nina},
  year      = {2019},
  publisher = {CEUR-WS},
  address   = {Aachen},
}

% LREC 2018 paper. The conference venue stays in booktitle; publisher name and
% city follow the spelling used by the other ELRA entries in this file.
@inProceedings{rouces-etal-2018-generating-264719,
  title     = {Generating a Gold Standard for a Swedish Sentiment Lexicon},
  abstract  = {We create a gold standard for sentiment annotation of Swedish terms, using the freely available SALDO lexicon and the Gigaword corpus. For this purpose, we employ a multi-stage approach combining corpus-based frequency sampling, direct score annotation and Best-Worst Scaling. In addition to obtaining a gold standard, we analyze the data from our process and we draw conclusions about the optimal sentiment model.},
  booktitle = {LREC 2018, Eleventh International Conference on Language Resources and Evaluation, May 7-12, 2018, Miyazaki (Japan)},
  author    = {Rouces, Jacobo and Tahmasebi, Nina and Borin, Lars and Rødven-Eide, Stian},
  year      = {2018},
  publisher = {European Language Resources Association},
  address   = {Paris},
  ISBN      = {979-10-95546-00-9},
}

% LREC 2018 paper describing the SenSALDO sentiment lexicon.
@inProceedings{rouces-etal-2018-sensaldo-264720,
  title     = {SenSALDO: Creating a Sentiment Lexicon for Swedish},
  abstract  = {The natural language processing subfield known as sentiment analysis or opinion mining has seen an explosive expansion over the last decade or so, and sentiment analysis has become a standard item in the NLP toolbox. Still, many theoretical and methodological questions remain unanswered and resource gaps unfilled. Most work on automated sentiment analysis has been done on English and a few other languages; for most written languages of the world, this tool is not available. This paper describes the development of an extensive sentiment lexicon for written (standard) Swedish. We investigate different methods for developing a sentiment lexicon for Swedish. We use an existing gold standard dataset for training and testing. 
For each word sense from the SALDO Swedish lexicon, we assign a real value sentiment score in the range [-1,1] and produce a sentiment label. We implement and evaluate three methods: a graph-based method that iterates over the SALDO structure, a method based on random paths over the SALDO structure and a corpus-driven method based on word embeddings. The resulting sense-disambiguated sentiment lexicon (SenSALDO) is an open source resource and freely available from Språkbanken, The Swedish Language Bank at the University of Gothenburg.},
  booktitle = {LREC 2018, Eleventh International Conference on Language Resources and Evaluation, May 7-12, 2018, Miyazaki (Japan)},
  author    = {Rouces, Jacobo and Tahmasebi, Nina and Borin, Lars and Rødven-Eide, Stian},
  year      = {2018},
  publisher = {European Language Resources Association},
  address   = {Paris},
  ISBN      = {979-10-95546-00-9},
}

% Paper in CEUR Workshop Proceedings vol. 2084; series and volume are split
% out of booktitle into their own fields.
@inProceedings{rouces-etal-2018-defining-264721,
  title     = {Defining a gold standard for a Swedish sentiment lexicon: Towards higher-yield text mining in the digital humanities},
  abstract  = {There is an increasing demand for multilingual sentiment analysis, and most work on sentiment lexicons is still carried out based on English lexicons like WordNet. In addition, many of the non-English sentiment lexicons that do exist have been compiled by (machine) translation from English resources, thereby arguably obscuring possible language-specific characteristics of sentiment-loaded vocabulary. In this paper we describe the creation from scratch of a gold standard for the sentiment annotation of Swedish terms as a first step towards the creation of a full-fledged sentiment lexicon for Swedish.},
  series    = {CEUR Workshop Proceedings},
  volume    = {2084},
  booktitle = {Proceedings of the Digital Humanities in the Nordic Countries 3rd Conference Helsinki, Finland, March 7-9, 2018. 
},
  editor    = {Mäkelä, Eetu and Tolonen, Mikko and Tuominen, Jouni},
  author    = {Rouces, Jacobo and Borin, Lars and Tahmasebi, Nina and Rødven-Eide, Stian},
  year      = {2018},
  publisher = {University of Helsinki, Faculty of Arts},
  address   = {Helsinki},
}

% Whole proceedings volume (NLP4CALL 2021). Typed as @proceedings; the series
% name and number are separated from the publisher name.
@proceedings{alfter-etal-2021-proceedings-311727,
  title     = {Proceedings of the 10th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2021)},
  abstract  = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures.},
  editor    = {Alfter, David and Volodina, Elena and Pilán, Ildikó and Graën, Johannes and Borin, Lars},
  year      = {2021},
  series    = {Linköping Electronic Conference Proceedings},
  number    = {177},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping, Sweden},
  ISBN      = {978-91-7929-625-4},
}

% Chapter in the edited volume "The Swedish FrameNet++"; volume editors are in
% the editor field.
@incollection{linden-etal-2021-multilingual-311386,
  title     = {A multilingual net of lexical resources},
  booktitle = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications},
  editor    = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
  author    = {Lindén, Krister and Niemi, Jyrki and Borin, Lars and Forsberg, Markus and Pedersen, Bolette S. and Nimb, Sanni and Orav, Heili and Kahusk, Neeme and Vider, Kadri},
  year      = {2021},
  publisher = {John Benjamins Publishing Company},
  address   = {Amsterdam},
  ISBN      = {978-90-272-0990-0},
  pages     = {123--137},
}

% Chapter in the edited volume "The Swedish FrameNet++".
@incollection{dannells-etal-2021-swedish-310041,
  title     = {Swedish FrameNet},
  abstract  = {This chapter describes the development of Swedish FrameNet. 
A new framenet project often follows one of two methodological approaches: (1) extension, through translation of a different-language – often English – framenet into the target language, and (2) merging, where the resource is built from scratch in the target language. Both approaches have their pros and cons, which have been extensively discussed in the literature. Swedish FrameNet is mainly developed through the extension approach, although balanced with the merging approach. Drawing on the two approaches simultaneously, we describe how integrated language resources and tools have been exploited to create and develop Swedish FrameNet: how it was constructed, what it contains, and the basic assumptions underlying the annotation of its contents. },
  booktitle = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications},
  editor    = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
  author    = {Dannélls, Dana and Borin, Lars and Forsberg, Markus and Friberg Heppin, Karin and Toporowska Gronostaj, Maria},
  year      = {2021},
  publisher = {John Benjamins Publishing Company},
  address   = {Amsterdam / Philadelphia},
  ISBN      = {978-90-272-5848-9},
  pages     = {37--66},
}

% Whole proceedings volume, so the entry type is @proceedings and the names
% are recorded as editors.
% NOTE(review): names come from the original author field; confirm they are
% the volume editors.
@proceedings{tahmasebi-etal-2019-proceedings-285886,
  title     = {Proceedings of the 1st International Workshop on Computational Approaches to Historical Language Change, August 2, 2019, Florence, Italy},
  editor    = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang},
  year      = {2019},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  ISBN      = {978-1-950737-31-4},
}

% Survey chapter in the edited volume "Computational approaches to semantic change".
@incollection{tahmasebi-etal-2021-survey-307058,
  title     = {Survey of computational approaches to lexical semantic change detection},
  abstract  = {Our languages are in constant flux driven by external factors such as cultural, societal and technological changes, as well as by only partially understood internal motivations. 
Words acquire new meanings and lose old senses, new words are coined or borrowed from other languages and obsolete words slide into obscurity. Understanding the characteristics of shifts in the meaning and in the use of words is useful for those who work with the content of historical texts, the interested general public, but also in and of itself. The findings from automatic lexical semantic change detection and the models of diachronic conceptual change are also currently being incorporated in approaches for measuring document across-time similarity, information retrieval from long-term document archives, the design of OCR algorithms, and so on. In recent years we have seen a surge in interest in the academic community in computational methods and tools supporting inquiry into diachronic conceptual change and lexical replacement. This article provides a comprehensive survey of recent computational techniques to tackle both.},
  booktitle = {Computational approaches to semantic change},
  editor    = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
  author    = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam},
  year      = {2021},
  publisher = {Language Science Press},
  address   = {Berlin},
  ISBN      = {978-3-96110-312-6},
  pages     = {1--91},
}

% Chapter in the edited volume "The Swedish FrameNet++".
@incollection{adesam-etal-2021-lexical-310933,
  title     = {A lexical resource for computational historical linguistics},
  abstract  = {In this chapter we present the diachronic dimension of Swedish FrameNet++. We describe the historical lexical resources currently available for Swedish, linked to the Contemporary Swedish lexicon Saldo. We present a case study of how interlinking the dictionaries simultaneously allows us to study lexical change. We also present a method of linking text words to lexicon entries, facilitating interactive exploration of historical texts. Diachronical language resources present both a high-variation challenge from a wider language technology perspective, and an interesting object of linguistic study. 
While a number of improvements of the parts of the diachronic lexical macroresource are still needed, this resource is invaluable for analysing and accessing historical texts, as well as for both synchronic historical and diachronic lexical studies.},
  booktitle = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications},
  editor    = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
  author    = {Adesam, Yvonne and Andersson, Peter and Borin, Lars and Bouma, Gerlof},
  year      = {2021},
  publisher = {John Benjamins Publishing Company},
  address   = {Amsterdam / Philadelphia},
  ISBN      = {978-90-272-5848-9},
  pages     = {98--121},
}

% Workshop paper (HistoInformatics 2019). The abstract has been cleaned of a
% line-break hyphenation artifact and one misspelling.
@inProceedings{fridlund-etal-2019-trawling-287968,
  title     = {Trawling for Terrorists: A Big Data Analysis of Conceptual Meanings and Contexts in Swedish Newspapers, 1780–1926},
  abstract  = {The conceptual history of terrorism has to a significant extent been studied through canonical texts or historical key figures or organisations. However, through the increasing digitization of text materials conventional research questions can now be approached from new angles or established results verified on the basis of exhaustive collections of data, rather than limited samples. Specifically, we are interested in evaluating and expanding on prior research claims regarding the meanings and contexts associated with the concepts terrorism and terrorist up until the twentieth century in a Swedish context. The investigation is guided by the following research questions: What historical meanings of the concept of terrorism were expressed in the Swedish newspaper discourse? 
What social and ideological contexts and violent political practices was the concept primarily associated with before the First World War?},
  booktitle = {Proceedings of the 5th International Workshop on Computational History (HistoInformatics 2019) co-located with the 23rd International Conference on Theory and Practice of Digital Libraries (TPDL 2019) Oslo, Norway, September 12th, 2019},
  editor    = {Wevers, Melvin and Hasanuzzaman, Mohammed and Dias, Gaël and Düring, Marten and Jatowt, Adam},
  author    = {Fridlund, Mats and Olsson, Leif-Jöran and Brodén, Daniel and Borin, Lars},
  year      = {2019},
  publisher = {CEUR-WS},
  address   = {Aachen},
}

% Journal article. The citation key contains an apostrophe and is kept as-is
% so that existing citations keep resolving.
@article{borin-etal-2021-bird's-309082,
  title     = {A bird’s-eye view on South Asian languages through LSI: Areal or genetic relationships?},
  abstract  = {We present initial exploratory work on illuminating the long-standing question of areal versus genealogical connections in South Asia using computational data visualization tools. With respect to genealogy, we focus on the subclassification of Indo-Aryan, the most ubiquitous language family of South Asia. The intent here is methodological: we explore computational methods for visualizing large datasets of linguistic features, in our case 63 features from 200 languages representing four language families of South Asia, coming out of a digitized version of Grierson’s Linguistic Survey of India. To this dataset we apply phylogenetic software originally developed in the context of computational biology for clustering the languages and displaying the clusters in the form of networks. We further explore multiple correspondence analysis as a way of illustrating how linguistic feature bundles correlate with extrinsically defined groupings of languages (genealogical and geographical). Finally, map visualization of combinations of linguistic features and language genealogy is suggested as an aid in distinguishing genealogical and areal features. 
On the whole, our results are in line with the conclusions of earlier studies: Areality and genealogy are strongly intertwined in South Asia, the traditional lower-level subclassification of Indo-Aryan is largely upheld, and there is a clearly discernible areal east–west divide cutting across language families.},
  journal   = {Journal of South Asian Languages and Linguistics},
  author    = {Borin, Lars and Saxena, Anju and Virk, Shafqat and Comrie, Bernard},
  year      = {2021},
  volume    = {7},
  number    = {2},
  pages     = {151--185},
}

% Chapter on visualization systems and applications of lexical semantic
% change research.
@incollection{jatowt-etal-2021-computational-307061,
  title     = {Computational approaches to lexical semantic change: Visualization systems and novel applications},
  abstract  = {The purpose of this chapter is to survey visualization and user interface solutions for understanding lexical semantic change as well as to survey a number of applications of techniques developed in computational analysis of lexical semantic change. We first overview approaches aiming to develop systems that support understanding semantic change in an interactive and visual way. It is generally accepted that computational techniques developed for analyzing and uncovering semantic change are beneficial to linguists, historians, sociologists, and practitioners in numerous related fields, especially within the humanities. However, quite a few non-professional users are equally interested in the histories of words. Developing interactive, visual, engaging, and easy-to-understand systems can help them to acquire relevant knowledge. Second, we believe that other fields could benefit from the research outcomes of computational approaches to lexical semantic change. In general, properly representing the meaning of terms used in the past should be important for a range of natural language processing, information retrieval and other tasks that operate on old texts. 
In the latter part of the chapter, we then focus on current and potential applications related to computer and information science with the underlying question: “How can modeling semantic change benefit wider downstream applications in these disciplines?”},
  booktitle = {Computational approaches to semantic change},
  editor    = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
  author    = {Jatowt, Adam and Tahmasebi, Nina and Borin, Lars},
  year      = {2021},
  publisher = {Language Science Press},
  address   = {Berlin},
  ISBN      = {978-3-96110-312-6},
  pages     = {311--339},
}

% Edited volume: a standard @book entry carrying an editor field replaces the
% non-standard edited_book entry type.
@book{tahmasebi-etal-2021-computational-306968,
  title     = {Computational approaches to semantic change},
  abstract  = {Semantic change — how the meanings of words change over time — has preoccupied scholars since well before modern linguistics emerged in the late 19th and early 20th century, ushering in a new methodological turn in the study of language change. Compared to changes in sound and grammar, semantic change is the least understood. Ever since, the study of semantic change has progressed steadily, accumulating a vast store of knowledge for over a century, encompassing many languages and language families. Historical linguists also early on realized the potential of computers as research tools, with papers at the very first international conferences in computational linguistics in the 1960s. Such computational studies still tended to be small-scale, method-oriented, and qualitative. However, recent years have witnessed a sea-change in this regard. Big-data empirical quantitative investigations are now coming to the forefront, enabled by enormous advances in storage capability and processing power. Diachronic corpora have grown beyond imagination, defying exploration by traditional manual qualitative methods, and language technology has become increasingly data-driven and semantics-oriented. These developments present a golden opportunity for the empirical study of semantic change over both long and short time spans. 
A major challenge presently is to integrate the hard-earned knowledge and expertise of traditional historical linguistics with cutting-edge methodology explored primarily in computational linguistics. The idea for the present volume came out of a concrete response to this challenge. The 1st International Workshop on Computational Approaches to Historical Language Change (LChange'19), at ACL 2019, brought together scholars from both fields. This volume offers a survey of this exciting new direction in the study of semantic change, a discussion of the many remaining challenges that we face in pursuing it, and considerably updated and extended versions of a selection of the contributions to the LChange'19 workshop, addressing both more theoretical problems — e.g., discovery of "laws of semantic change" — and practical applications, such as information retrieval in longitudinal text archives.},
  editor    = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
  year      = {2021},
  publisher = {Language Science Press},
  address   = {Berlin},
  ISBN      = {978-3-98554-008-2},
}

% Whole proceedings volume (NLP4CALL 2020), so the entry type is @proceedings
% and the names are recorded as editors.
% NOTE(review): names come from the original author field; confirm they are
% the volume editors.
@proceedings{alfter-etal-2020-proceedings-300071,
  title     = {Proceedings of the 9th Workshop on Natural Language Processing for Computer Assisted Language Learning 2020},
  abstract  = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures. 
This collection presents four selected papers describing use of Language Technology for language learning.},
  editor    = {Alfter, David and Volodina, Elena and Pilán, Ildikó and Lange, Herbert and Borin, Lars},
  year      = {2020},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  ISBN      = {978-91-7929-732-9},
}

% Workshop paper; the proceedings editors are in the editor field instead of
% being appended to booktitle.
@inProceedings{lindahl-etal-2019-towards-286588,
  title     = {Towards Assessing Argumentation Annotation - A First Step},
  abstract  = {This paper presents a first attempt at using Walton’s argumentation schemes for annotating arguments in Swedish political text and assessing the feasibility of using this particular set of schemes with two linguistically trained annotators. The texts are not pre-annotated with argumentation structure beforehand. The results show that the annotators differ both in number of annotated arguments and selection of the conclusion and premises which make up the arguments. They also differ in their labeling of the schemes, but grouping the schemes increases their agreement. The outcome from this will be used to develop guidelines for future annotations.},
  booktitle = {Proceedings of the 6th Workshop on Argument Mining, August 1, 2019, Florence, Italy},
  editor    = {Stein, Benno and Wachsmuth, Henning},
  author    = {Lindahl, Anna and Borin, Lars and Rouces, Jacobo},
  year      = {2019},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  ISBN      = {978-1-950737-33-8},
}

% Conference paper on a Swedish corpus annotated for aspect-based sentiment analysis.
@inProceedings{rouces-etal-2020-creating-290695,
  title     = {Creating an Annotated Corpus for Aspect-Based Sentiment Analysis in Swedish},
  abstract  = {Aspect-Based Sentiment Analysis constitutes a more fine-grained alternative to traditional sentiment analysis at sentence level. In addition to a sentiment value denoting how positive or negative a particular opinion or sentiment expression is, it identifies additional aspects or 'slots' that characterize the opinion. Some typical aspects are target and source, i.e. 
who holds the opinion and about which entity or aspect is the opinion. We present a large Swedish corpus annotated for Aspect-Based Sentiment Analysis. Each sentiment expression is annotated as a tuple that contains the following fields: one among 5 possible sentiment values, the target, the source, and whether the sentiment expressed is ironic. In addition, the linguistic element that conveys the sentiment is identified too. Sentiment for a particular topic is also annotated at title, paragraph and document level. The documents are articles obtained from two Swedish media (Svenska Dagbladet and Aftonbladet) and one online forum (Flashback), totalling around 4000 documents. The corpus is freely available and we plan to use it for training and testing an Aspect-Based Sentiment Analysis system.},
  booktitle = {Proceedings of the 5th conference in Digital Humanities in the Nordic Countries, Riga, Latvia, October 21-23, 2020},
  series    = {CEUR Workshop Proceedings},
  author    = {Rouces, Jacobo and Borin, Lars and Tahmasebi, Nina},
  year      = {2020},
  publisher = {CEUR-WS},
}

% Workshop paper (LaTeCH at EACL 2012).
@inProceedings{dannells-borin-2012-toward-156502,
  title     = {Toward language independent methodology for generating artwork descriptions – Exploring FrameNet information},
  abstract  = {Today museums and other cultural heritage institutions are increasingly storing object descriptions using semantic web domain ontologies. To make this content accessible in a multilingual world, it will need to be conveyed in many languages, a language generation task which is domain specific and language dependent. This paper describes how semantic and syntactic information such as that provided in a framenet can contribute to solving this task. It is argued that the kind of information offered by such lexical resources enhances the output quality of a multilingual language generation application, in particular when generating domain specific content. 
},
  booktitle = {EACL 2012 workshop on Language Technology for Cultural Heritage, Social Sciences, and Humanities (LaTeCH)},
  author    = {Dannélls, Dana and Borin, Lars},
  year      = {2012},
}

% Globalex 2020 workshop paper.
% NOTE(review): the source abstract read "Fromlexicon, word sense
% disambiguation, topic detection the literature we know ...". The stray phrase
% is presumably a keyword list that leaked into the abstract; it has been moved
% to the keywords field. Confirm against the publisher's record.
@inProceedings{zechner-borin-2020-towards-296900,
  title     = {Towards a Swedish Roget-Style Thesaurus for NLP},
  abstract  = {Bring’s thesaurus (Bring) is a Swedish counterpart of Roget, and its digitized version could make a valuable language resource for use in many and diverse natural language processing (NLP) applications. From the literature we know that Roget-style thesauruses and wordnets have complementary strengths in this context, so both kinds of lexical-semantic resource are good to have. However, Bring was published in 1930, and its lexical items are in the form of lemma–POS pairings. In order to be useful in our NLP systems, polysemous lexical items need to be disambiguated, and a large amount of modern vocabulary must be added in the proper places in Bring. The work presented here describes experiments aiming at automating these two tasks, at least in part, where we use the structure of an existing Swedish semantic lexicon – Saldo – both for disambiguation of ambiguous Bring entries and for addition of new entries to Bring.},
  keywords  = {lexicon, word sense disambiguation, topic detection},
  booktitle = {Proceedings of the 2020 Globalex Workshop on Linked Lexicography. Language Resources and Evaluation Conference (LREC 2020), Marseille, 11–16 May 2020},
  author    = {Zechner, Niklas and Borin, Lars},
  year      = {2020},
  publisher = {European Language Resources Association},
  address   = {Paris},
  ISBN      = {979-10-95546-46-7},
}

% LREC 2020 paper presenting the NordiCon name database.
@inProceedings{waldispuhl-etal-2020-material-293332,
  title     = {Material Philology Meets Digital Onomastic Lexicography: The NordiCon Database of Medieval Nordic Personal Names in Continental Sources},
  abstract  = {We present NordiCon, a database containing medieval Nordic personal names attested in Continental sources. 
The database combines formally interpreted and richly interlinked onomastic data with digitized versions of the medieval manuscripts from which the data originate and information on the tokens' context. The structure of NordiCon is inspired by other online historical given name dictionaries. It takes up challenges reported on in previous works, such as how to cover material properties of a name token and how to define lemmatization principles, and elaborates on possible solutions. The lemmatization principles for NordiCon are further developed in order to facilitate the connection to other name dictionaries and corpuses, and the integration of the database into Språkbanken Text, an infrastructure containing modern and historical written data.},
  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference, Marseille, 11–16 May 2020},
  editor    = {Calzolari, Nicoletta and others},
  author    = {Waldispühl, Michelle and Dannélls, Dana and Borin, Lars},
  year      = {2020},
  publisher = {European Language Resources Association},
  address   = {Paris},
  ISBN      = {979-10-95546-34-4},
}

% Workshop paper (LDL-2020 at LREC 2020) introducing language profiles.
@inProceedings{virk-etal-2020-from-295339,
  title     = {From Linguistic Descriptions to Language Profiles},
  abstract  = {Language catalogues and typological databases are two important types of resources containing different types of knowledge about the world’s natural languages. The former provide metadata such as number of speakers, location (in prose descriptions and/or GPS coordinates), language code, literacy, etc., while the latter contain information about a set of structural and functional attributes of languages. Given that both types of resources are developed and later maintained manually, there are practical limits as to the number of languages and the number of features that can be surveyed. 
We introduce the concept of a language profile, which is intended to be a structured representation of various types of knowledge about a natural language extracted semi-automatically from descriptive documents and stored at a central location. It has three major parts: (1) an introductory; (2) an attributive; and (3) a reference part, each containing different types of knowledge about a given natural language. As a case study, we develop and present a language profile of an example language. At this stage, a language profile is an independent entity, but in the future it is envisioned to become part of a network of language profiles connected to each other via various types of relations. Such a representation is expected to be suitable both for humans and machines to read and process for further deeper linguistic analyses and/or comparisons.},
  booktitle = {Proceedings of the 7th Workshop on Linked Data in Linguistics (LDL-2020). Language Resources and Evaluation Conference (LREC 2020), Marseille, 11–16 May 2020},
  editor    = {Ionov, Maxim and McCrae, John P. and Chiarcos, Christian and Declerck, Thierry and Bosque-Gil, Julia and Gracia, Jorge},
  author    = {Virk, Shafqat and Hammarström, Harald and Borin, Lars and Forsberg, Markus and Wichmann, Søren},
  year      = {2020},
  publisher = {European Language Resources Association},
  address   = {Paris},
  ISBN      = {979-10-95546-36-8},
}

% Conference paper (CICLing 2017) on digitizing the Linguistic Survey of India.
@inProceedings{borin-etal-2018-language-290841,
  title     = {Language technology for digital linguistics: Turning the Linguistic Survey of India into a rich source of linguistic information},
  abstract  = {We present our work aiming at turning the linguistic material available in Grierson’s classical Linguistic Survey of India (LSI) from a printed discursive textual description into a formally structured digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia. 
While doing so, we develop state-of-the-art language technology for automatically extracting the relevant grammatical information from the text of the LSI, and interactive linguistic information visualization tools for better analysis and comparisons of languages based on their structural and functional features.},
  series    = {Lecture Notes in Computer Science},
  booktitle = {Computational Linguistics and Intelligent Text Processing, 18th International Conference, CICLing 2017, Budapest, Hungary, April 17–23, 2017},
  author    = {Borin, Lars and Virk, Shafqat and Saxena, Anju},
  year      = {2018},
  publisher = {Springer},
  address   = {Cham},
}

% Conference paper (RANLP 2019) on frame-semantic extraction of typological information.
@inProceedings{virk-etal-2019-exploiting-290903,
  title     = {Exploiting frame semantics and frame-semantic parsing for automatic extraction of typological information from descriptive grammars of natural languages},
  abstract  = {We describe a novel system for automatic extraction of typological linguistic information from descriptive grammars of natural languages, applying the theory of frame semantics in the form of frame-semantic parsing. The current proof-of-concept system covers a few selected linguistic features, but the methodology is general and can be extended not only to other typological features but also to descriptive grammars written in languages other than English. 
Such a system is expected to be a useful assistance for automatic curation of typological databases which otherwise are built manually, a very labor and time consuming as well as cognitively taxing enterprise.}, booktitle = {12th International Conference on Recent Advances in Natural Language Processing, RANLP 2019, Varna, Bulgaria, 2-4 September 2019}, author = {Virk, Shafqat and Muhammad, Azam Sheikh and Borin, Lars and Aslam, Muhammad Irfan and Iqbal, Saania and Khurram, Nazia}, year = {2019}, publisher = {INCOMA Ltd.}, address = {Shoumen, Bulgaria}, ISBN = {978-954-452-055-7}, } @misc{alfter-etal-2019-proceedings-285613, title = {Proceedings of the 8th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2019), September 30, Turku Finland}, abstract = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promote development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. 
This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competences from these areas for sharing experiences and brainstorming around the future of the field. }, author = {Alfter, David and Volodina, Elena and Borin, Lars and Pilán, Ildikó and Lange, Herbert}, year = {2019}, publisher = {Linköping University Electronic Press, Linköpings universitet}, address = {Linköping}, ISBN = {978-91-7929-998-9}, } @inProceedings{alfter-etal-2019-larka-281344, title = {Lärka: From Language Learning Platform to Infrastructure for Research on Language Learning}, abstract = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpusbased exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. Nowadays, Lärka is being adapted into a building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN. Lärka has recently received a new responsive user interface adapted to different devices with different screen sizes. Moreover, the system has also been augmented with new functionalities. These recent additions aim at improving the usability and the usefulness of the platform for pedagogical purposes. The most important development, though, is the adaptation of the platform to serve as a component in an e-infrastructure supporting research on language learning and multilingualism. 
Thanks to Lärka’s service-oriented architecture, most functionalities are also available as web services which can be easily re-used by other applications.}, booktitle = {Linköping Electronic Conference Proceedings}, author = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena}, year = {2019}, publisher = {Linköping University Press}, address = {Linköping}, ISBN = {978-91-7685-034-3}, } @techreport{ljunglof-etal-2019-assessing-281222, title = {Assessing the quality of Språkbanken’s annotations}, abstract = {Most of the corpora in Språkbanken Text consist of unannotated plain text, such as almost all newspaper texts, social media texts, novels and official documents. We also have some corpora that are manually annotated in different ways, such as Talbanken (annotated for part-of-speech and syntactic structure), and the Stockholm Umeå Corpus (annotated for part-of-speech). Språkbanken’s annotation pipeline Sparv aims to automatise the work of automatically annotating all our corpora, while still keeping the manual annotations intact. When all corpora are annotated, they can be made available, e.g., in the corpus search tools Korp and Strix. Until now there has not been any comprehensive overview of the annotation tools and models that Sparv has been using for the last eight years. Some of them have not been updated since the start, such as the part-of-speech tagger Hunpos and the dependency parser MaltParser. There are also annotation tools that we still have not included, such as a constituency-based parser. Therefore Språkbanken initiated a project with the aim of conducting such an overview. This document is the outcome of that project, and it contains descriptions of the types of manual and automatic annotations that we currently have in Språkbanken, as well as an incomplete overview of the state-of-the-art with regards to annotation tools and models. 
}, author = {Ljunglöf, Peter and Zechner, Niklas and Nieto Piña, Luis and Adesam, Yvonne and Borin, Lars}, year = {2019}, } @misc{pilan-etal-2018-proceedings-275358, title = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018), SLTC, Stockholm, 7th November 2018 }, abstract = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. 
The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.}, author = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars}, year = {2018}, publisher = {Linköping University Electronic Press, Linköpings universitet}, address = {Linköping}, ISBN = {978-91-7685-173-9}, } @inProceedings{alfter-etal-2018-from-275364, title = {From Language Learning Platform to Infrastructure for Research on Language Learning}, abstract = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.}, booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy}, author = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena}, year = {2018}, } @inProceedings{adesam-etal-2018-eukalyptus-273839, title = {The Eukalyptus Treebank of Written Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard and Borin, Lars and Forsberg, Markus}, year = {2018}, } @incollection{haugen-borin-2018-danish-267403, title = {Danish, Norwegian and Swedish}, booktitle = {The world's major languages}, editor = {Bernard Comrie}, author = {Haugen, Einar and Borin, Lars}, year = {2018}, publisher = {Routledge}, address = {London and New York}, ISBN = {9781138184824}, pages = {127--150}, } @incollection{borin-etal-2018-linguistics-269084, 
title = {Linguistics vs. language technology in constructicon building and use}, abstract = {In this chapter, we describe the close interaction of linguists and language technologists in the Swedish constructicon project. This kind of collaboration is not so common today, because of the way that language technology has developed in recent decades, but in our case the collaboration has been very successful, and constituted a genuine instance of cross-fertilization, where an evolving language technology infrastructure and a computational lexical macroresource described in the chapter has formed an integral part of the Swedish constructicon development environment, while at the same time the structured linguistic knowledge described in the constructicon has informed the language technology making up the infrastructure.}, booktitle = {Constructicography: Constructicon development across languages}, editor = {Benjamin Lyngfelt and Lars Borin and Kyoko Ohara and Tiago Timponi Torrent}, author = {Borin, Lars and Dannélls, Dana and Gruzitis, Normunds}, year = {2018}, publisher = {John Benjamins}, address = {Amsterdam}, ISBN = {9789027263865}, pages = {229--253}, } @book{lyngfelt-etal-2018-constructicography-269082, title = {Constructicography: Constructicon development across languages}, abstract = {In constructionist theory, a constructicon is an inventory of constructions making up the full set of linguistic units in a language. In applied practice, it is a set of construction descriptions – a “dictionary of constructions”. The development of constructicons in the latter sense typically means combining principles of both construction grammar and lexicography, and is probably best characterized as a blend between the two traditions. We call this blend constructicography. The present volume is a comprehensive introduction to the emerging field of constructicography. 
After a general introduction follow six chapters presenting constructicon projects for English, German, Japanese, Brazilian Portuguese, Russian, and Swedish, respectively, often in relation to a framenet of the language. In addition, there is a chapter addressing the interplay between linguistics and language technology in constructicon development, and a final chapter exploring the prospects for interlingual constructicography. This is the first major publication devoted to constructicon development and it should be particularly relevant for those interested in construction grammar, frame semantics, lexicography, the relation between grammar and lexicon, or linguistically informed language technology. }, editor = {Lyngfelt, Benjamin and Borin, Lars and Ohara, Kyoko and Torrent, Tiago Timponi}, year = {2018}, publisher = {John Benjamins}, address = {Amsterdam}, ISBN = {9789027263865}, } @incollection{lyngfelt-etal-2018-constructicography-269085, title = {Constructicography at work: Theory meets practice in the Swedish constructicon}, abstract = {This chapter addresses central topics in constructicography from the viewpoint of the Swedish constructicon project (SweCcn), focusing on practical constructicon development. The full process of construction description is described and discussed, from selection via corpus analysis to finished constructicon entry and beyond, towards structuring the set of entries into a network. Particular attention is given to the description format and the treatment of constructional variation. A main theme in the chapter is the interdependence and alignment of SweCcn and related resources, on the one hand in the local context, notably the infrastructure of Språkbanken (the Swedish language bank), and on the other hand with respect to corresponding resources for other languages. 
Of key concern is the relation to FrameNet, both the Swedish and other framenets, and a major section is devoted to conditions for linking constructions and frames.}, booktitle = {Constructicography: Constructicon development across languages}, editor = {Benjamin Lyngfelt and Lars Borin and Kyoko Ohara and Tiago Timponi Torrent}, author = {Lyngfelt, Benjamin and Bäckström, Linnéa and Borin, Lars and Ehrlemark, Anna and Rydstedt, Rudolf}, year = {2018}, publisher = {John Benjamins}, address = {Amsterdam}, ISBN = {9789027263865}, pages = {41--106}, } @incollection{borin-edlund-2018-language-269047, title = {Language technology and 3rd wave HCI: Towards phatic communication and situated interaction}, abstract = {In the field of language technology, researchers are starting to pay more attention to various interactional aspects of language – a development prompted by a confluence of factors, and one which applies equally to the processing of written and spoken language. Notably, the so-called ‘phatic’ aspects of linguistic communication are coming into focus in this work, where linguistic interaction is increasingly recognized as being fundamentally situated. This development resonates well with the concerns of third wave HCI, which involves a shift in focus from stating the requirements on HCI design primarily in terms of “context-free” information flow, to a view where it is recognized that HCI – just like interaction among humans – is indissolubly embedded in complex, shifting contexts. 
These – together with the different backgrounds and intentions of interaction participants – shape the interaction in ways which are not readily understandable in terms of rational information exchange, but which are nevertheless central aspects of the interaction, and which therefore must be taken into account in HCI design, including its linguistic aspects, forming the focus of this chapter.}, booktitle = {New Directions in Third Wave Human-Computer Interaction: Volume 1 - Technologies}, editor = {Michael Filimowicz and Veronika Tzankova}, author = {Borin, Lars and Edlund, Jens}, year = {2018}, publisher = {Springer International Publishing}, address = {Cham}, ISBN = {978-3-319-73355-5}, pages = {251--264}, } @inProceedings{malm-etal-2018-lingfn-267404, title = {LingFN: Towards a framenet for the linguistics domain}, abstract = {Framenets and frame semantics have proved useful for a number of natural language processing (NLP) tasks. However, in this connection framenets have often been criticized for limited coverage. A proposed reasonable-effort solution to this problem is to develop domain-specific (sublanguage) framenets to complement the corresponding general-language framenets for particular NLP tasks, and in the literature we find such initiatives covering, e.g., medicine, soccer, and tourism. In this paper, we report on our experiments and first results on building a framenet to cover the terms and concepts encountered in descriptive linguistic grammars. A contextual statistics based approach is used to judge the polysemous nature of domain-specific terms, and to design new domain-specific frames. 
The work is part of a more extensive research undertaking where we are developing NLP methodologies for automatic extraction of linguistic information from traditional linguistic descriptions to build typological databases, which otherwise are populated using a labor intensive manual process.}, booktitle = {Proceedings : LREC 2018 Workshop, International FrameNet Workshop 2018. Multilingual Framenets and Constructicons, May 12, 2018, Miyazaki, Japan / Edited by Tiago Timponi Torrent, Lars Borin and Collin F. Baker}, author = {Malm, Per and Virk, Shafqat and Borin, Lars and Saxena, Anju}, year = {2018}, publisher = {ELRA}, address = {Miyazaki}, ISBN = {979-10-95546-04-7}, } @misc{torrent-etal-2018-proceedings-267405, title = {Proceedings of the LREC 2018 Workshop International FrameNet Workshop 2018: Multilingual Framenets and Constructicons. 12 May 2018 – Miyazaki, Japan}, abstract = {The International FrameNet Workshop 2018 brought together researchers in Frame Semantics and Construction Grammar, two areas which have traditionally been interrelated, but which have been developing somewhat independently in recent years. It is also addressed at language technology researchers working with language resources based on Frame Semantics or Construction Grammar. 
The workshop follows on from similar joint meetings in Berkeley, California in 2013 (IFNW 2013, sponsored by the Swedish FrameNet group) and in Juiz de Fora, Brazil in 2016 (IFNW 2016, sponsored by FrameNet Brasil), and will cover the rapidly unfolding developments in both areas and recent research on their interconnections.}, author = {Torrent, Tiago Timponi and Borin, Lars and Baker, Collin}, year = {2018}, publisher = {ELRA}, address = {Miyazaki}, ISBN = {979-10-95546-04-7}, } @inProceedings{borin-etal-2017-clarin-261157, title = {Swe-Clarin: Language resources and technology for Digital Humanities}, abstract = {CLARIN is a European Research Infrastructure Consortium (ERIC), which aims at (a) making extensive language-based materials available as primary research data to the humanities and social sciences (HSS); and (b) offering state-of-the-art language technology (LT) as an e-research tool for this purpose, positioning CLARIN centrally in what is often referred to as the digital humanities (DH). The Swedish CLARIN node Swe-Clarin was established in 2015 with funding from the Swedish Research Council. In this paper, we describe the composition and activities of Swe-Clarin, aiming at meeting the requirements of all HSS and other researchers whose research involves using text and speech as primary research data, and spreading the awareness of what Swe-Clarin can offer these research communities. We focus on one of the central means for doing this: pilot projects conducted in collaboration between HSS researchers and Swe-Clarin, together formulating a research question, the addressing of which requires working with large language-based materials. Four such pilot projects are described in more detail, illustrating research on rhetorical history, second-language acquisition, literature, and political science. 
A common thread to these projects is an aspiration to meet the challenge of conducting research on the basis of very large amounts of textual data in a consistent way without losing sight of the individual cases making up the mass of data, i.e., to be able to move between Moretti’s “distant” and “close reading” modes. While the pilot projects clearly make substantial contributions to DH, they also reveal some needs for more development, and in particular a need for document-level access to the text materials. As a consequence of this, work has now been initiated in Swe-Clarin to meet this need, so that Swe-Clarin together with HSS scholars investigating intricate research questions can take on the methodological challenges of big-data language-based digital humanities.}, booktitle = {Digital Humanities 2016. Extended Papers of the International Symposium on Digital Humanities (DH 2016) Växjö, Sweden, November, 7-8, 2016. Edited by Koraljka Golub, Marcelo Milra. Vol-2021}, author = {Borin, Lars and Tahmasebi, Nina and Volodina, Elena and Ekman, Stefan and Jordan, Caspar and Viklund, Jon and Megyesi, Beáta and Näsman, Jesper and Palmér, Anne and Wirén, Mats and Björkenstam, Kristina and Grigonyte, Gintare and Gustafson Capková, Sofia and Kosiński, Tomasz}, year = {2017}, publisher = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik V, RWTH Aachen.}, address = {Aachen}, } @inProceedings{borin-etal-2018-many-267534, title = {Many a little makes a mickle - infrastructure component reuse for a massively multilingual linguistic study}, abstract = {We present ongoing work aiming at turning the linguistic material available in Grierson’s classical Linguistic Survey of India (LSI) into a digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia and studies relating to language typology and contact linguistics. 
The project has two concrete main aims: (1) to conduct a linguistic investigation of the claim that South Asia constitutes a linguistic area; (2) to develop state-of-the-art language technology for automatically extracting the relevant information from the text of the LSI. In this presentation we focus on how, in the first part of the project, a number of existing research infrastructure components provided by Swe-Clarin, the Swedish CLARIN consortium, have been ‘recycled’ in order to allow the linguists involved in the project to quickly orient themselves in the vast LSI material, and to be able to provide input to the language technologists designing the tools for information extraction from the descriptive grammars.}, booktitle = {Selected papers from the CLARIN Annual Conference 2017, Budapest, 18–20 September 2017}, author = {Borin, Lars and Virk, Shafqat and Saxena, Anju}, year = {2018}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7685-273-6}, } @inProceedings{karsvall-borin-2018-sdhk-265603, title = {SDHK meets NER: Linking place names with medieval charters and historical maps}, booktitle = {CEUR Workshop Proceedings, vol. 2084. Proceedings of the Digital Humanities in the Nordic Countries 3rd Conference Helsinki, Finland, March 7-9, 2018. Edited by Eetu Mäkelä, Mikko Tolonen, Jouni Tuominen}, author = {Karsvall, Olof and Borin, Lars}, year = {2018}, publisher = {University of Helsinki, Faculty of Arts}, address = {Helsinki}, } @article{pilan-etal-2017-candidate-260382, title = {Candidate sentence selection for language learning exercises: From a comprehensive framework to an empirical evaluation}, abstract = {We present a framework and its implementation relying on Natural Language Processing methods, which aims at the identification of exercise item candidates from corpora. The hybrid system combining heuristics and machine learning methods includes a number of relevant selection criteria. 
We focus on two fundamental aspects: linguistic complexity and the dependence of the extracted sentences on their original context. Previous work on exercise generation addressed these two criteria only to a limited extent, and a refined overall candidate sentence selection framework appears also to be lacking. In addition to a detailed description of the system, we present the results of an empirical evaluation conducted with language teachers and learners which indicate the usefulness of the system for educational purposes. We have integrated our system into a freely available online learning platform.}, journal = {Revue Traitement Automatique des Langues. Special issue on NLP for Learning and Teaching}, author = {Pilán, Ildikó and Volodina, Elena and Borin, Lars}, year = {2017}, volume = {57}, number = {3}, pages = {67--91}, } @misc{volodina-etal-2017-preface-262846, title = {Preface. Proceedings of the Joint 6th Workshop on NLP for Computer Assisted Language Learning and 2nd Workshop on NLP for Research on Language Acquisition at NoDaLiDa 2017, Gothenburg, 22nd May 2017}, abstract = {For the second year in a row we brought two related themes of NLP for Computer-Assisted Language Learning and NLP for Language Acquisition together. The goal of organizing joint workshops is to provide a meeting place for researchers working on language learning issues including both empirical and experimental studies and NLP-based applications. The resulting volume covers a variety of topics from the two fields and - hopefully - showcases the challenges and achievements in the field. 
The seven papers in this volume cover native language identification in learner writings, using syntactic complexity development in language learner language to identify reading comprehension texts of appropriate level, exploring the potential of parallel corpora to predict mother-language specific problem areas for learners of another language, tools for learning languages - both well-resourced ones such as English as well as endangered or under-resourced ones such as Yakut and Võro, as well as exploring the potential of automatically identifying and correcting word-level errors in Swedish learner writing.}, author = {Volodina, Elena and Pilán, Ildikó and Borin, Lars and Grigonyte, Gintare and Nilsson Björkenstam, Kristina}, year = {2017}, volume = {30}, pages = {i--vi}, } @inProceedings{volodina-etal-2017-svalex-262848, title = {SVALex. En andraspråksordlista med CEFR-nivåer}, abstract = {När man planerar att utveckla en språkkurs i ett andra- eller främmandespråk (L2) ställs man inför utmaningen att definiera vilket ordförråd inlärarna behöver tillägna sig. Forskning inom andraspråksinlärning tyder på att läsaren behöver kunna 95–98 \% av löporden i en text för att förstå den (Laufer \& Ravenhorst-Kalovski 2010). Sådana studier är användbara för att uppskatta storleken på det ordförråd som behövs för att tillägna sig innehållet i en text, men de ger ingen närmare metodologisk vägledning för den som vill utveckla nivåstrukturerade läromedel eller kurser för andraspråksundervisning. Speciellt tydligt är detta inom CALL, Computer-Assisted Language Learning, där läromaterial (t.ex. övningar) genereras automatiskt, och behöver elektroniska resurser som kunskapskälla. Man kan istället angripa problemet från andra hållet. Om man har en samling nivåklassificerade texter för andraspråksinlärare kan man utifrån dem bygga ordlistor där varje ord är placerat på en färdighetsskala. 
Om man känner till den förutsatta färdighetsnivån hos läsaren, kan man helt enkelt anta att den textnivå där ett ord dyker upp första gången också anger ordets svårighetsgrad. SVALex är ett lexikon som har byggts enligt den principen. Resursen ska kunna användas av inlärare och lärare i svenska som andraspråk, men även av lexikografer, av kursutvecklare och provkonstruktörer samt av dem som likt oss själva ägnar sig åt utveckling av språkteknologibaserade datorstöd för språkinlärning och språktestning. SVALex utgör en vidareutveckling i förhållande till tidigare lexikonresurser för svenska som andraspråk (se avsnitt 2), genom att den konsekvent relaterar de 15 681 lexikoningångarna till en vida använd färdighetsskala för andra- och främmandespråksinlärning, Europarådets gemensamma europeiska referensram för språk (Common European Framework of Reference, i fortsättningen refererad till som CEFR) (Council of Europe 2001; Skolverket 2009). Nivåklassningen av lexikonenheterna i SVALex görs på basis av deras distribution i COCTAILL, en korpus innehållande lärobokstexter i svenska som andraspråk, där lärare har placerat in varje text i någon av CEFR-nivåerna (Volodina et al. 2014). }, booktitle = {Svenskans beskrivning. 
35, Förhandlingar vid trettiofemte sammankomsten : Göteborg 11–13 maj 2016 / Redigerad av Emma Sköldberg, Maia Andréasson, Henrietta Adamsson Eryd, Filippa Lindahl, Sven Lindström, Julia Prentice \& Malin Sandberg}, author = {Volodina, Elena and Borin, Lars and Pilán, Ildikó and François, Thomas and Tack, Annaïs}, year = {2017}, publisher = {Göteborgs universitet}, address = {Göteborg}, ISBN = {978-91-87850-64-6}, } @misc{volodina-etal-2017-proceedings-262838, title = {Proceedings of the Joint 6th Workshop on NLP for Computer Assisted Language Learning and 2nd Workshop on NLP for Research on Language Acquisition at NoDaLiDa 2017, Gothenburg, 22nd May 2017}, abstract = {For the second year in a row we have brought the two related themes of NLP for Computer-Assisted Language Learning and NLP for Language Acquisition together under one umbrella. The goal of organizing these joint workshops is to provide a meeting place for researchers working on language learning issues including both empirical and experimental studies and NLP-based applications.}, author = {Volodina, Elena and Pilán, Ildikó and Borin, Lars and Grigonyte, Gintare and Nilsson Björkenstam, Kristina}, year = {2017}, publisher = {Linköping University Press}, address = {Linköping, Sweden}, ISBN = {978-91-7685-502-7}, } @techreport{hammarstedt-etal-2017-korp-256055, title = {Korp 6 - Technical Report}, author = {Hammarstedt, Martin and Roxendal, Johan and Öhrman, Maria and Borin, Lars and Forsberg, Markus and Schumacher, Anne}, year = {2017}, institution = {Institutionen för svenska språket, Göteborgs universitet}, } @techreport{hammarstedt-etal-2017-korp-256056, title = {Korp 6 - Användarmanual}, author = {Hammarstedt, Martin and Borin, Lars and Forsberg, Markus and Roxendal, Johan and Schumacher, Anne and Öhrman, Maria}, year = {2017}, institution = {Institutionen för svenska språket, Göteborgs universitet}, } @inProceedings{borin-etal-2016-sparv-246053, title = {Sparv: Språkbanken’s corpus annotation 
pipeline infrastructure}, abstract = {Sparv is Språkbanken's corpus annotation pipeline infrastructure. The easiest way to use the pipeline is from its web interface with a plain text document. The pipeline uses in-house and external tools on the text to segment it into sentences and paragraphs, tokenise, tag parts-of-speech, look up in dictionaries and analyse compounds. The pipeline can also be run using a web API with XML results, and it is run locally at Språkbanken to prepare the documents in Korp, our corpus search tool. While the most sophisticated support is for modern Swedish, the pipeline supports 15 languages.}, booktitle = {SLTC 2016. The Sixth Swedish Language Technology Conference, Umeå University, 17-18 November, 2016}, author = {Borin, Lars and Forsberg, Markus and Hammarstedt, Martin and Rosén, Dan and Schäfer, Roland and Schumacher, Anne}, year = {2016}, } @inProceedings{cap-etal-2016-sword-254388, title = {SWORD: Towards Cutting-Edge Swedish Word Processing}, abstract = {Despite many years of research on Swedish language technology, there is still no well-documented standard for Swedish word processing covering the whole spectrum from low-level tokenization to morphological analysis and disambiguation. SWORD is a new initiative within the SWE-CLARIN consortium aiming to develop documented standards for Swedish word processing. In this paper, we report on a pilot study of Swedish tokenization, where we compare the output of six different tokenizers on four different text types. 
For one text type (Wikipedia articles), we also compare to the tokenization produced by six manual annotators.}, booktitle = {Proceedings of the Sixth Swedish Language Technology Conference (SLTC) Umeå University, 17-18 November, 2016}, author = {Cap, Fabienne and Adesam, Yvonne and Ahrenberg, Lars and Borin, Lars and Bouma, Gerlof and Forsberg, Markus and Kann, Viggo and Östling, Robert and Smith, Aaron and Wirén, Mats and Nivre, Joakim}, year = {2016}, } @inProceedings{borin-etal-2016-towards-253952, title = {Towards a Big Data View on South Asian Linguistic Diversity}, abstract = {South Asia with its rich and diverse linguistic tapestry of hundreds of languages, including many from four major language families, and a long history of intensive language contact, provides rich empirical data for studies of linguistic genealogy, linguistic typology, and language contact. South Asia is often referred to as a linguistic area, a region where, due to close contact and widespread multilingualism, languages have influenced one another to the extent that both related and unrelated languages are more similar on many linguistic levels than we would expect. However, with some rare exceptions, most studies are largely impressionistic, drawing examples from a few languages. In this paper we present our ongoing work aiming at turning the linguistic material available in Grierson’s Linguistic Survey of India (LSI) into a digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia. 
In addition to this, we aim to contribute to the methodological development of large-scale comparative linguistics drawing on digital language resources, by exploring NLP techniques for extracting linguistic information from free-text language descriptions of the kind found in the LSI.}, booktitle = {WILDRE-3 – 3rd Workshop on Indian Language Data: Resources and Evaluation}, author = {Borin, Lars and Virk, Shafqat and Saxena, Anju}, year = {2016}, publisher = {ELRA}, address = {Paris}, } @inProceedings{r?dveneide-etal-2016-swedish-250073, title = {The Swedish Culturomics Gigaword Corpus: A One Billion Word Swedish Reference Dataset for NLP}, abstract = {In this paper we present a dataset of contemporary Swedish containing one billion words. The dataset consists of a wide range of sources, all annotated using a state-of-the-art corpus annotation pipeline, and is intended to be a static and clearly versioned dataset. This will facilitate reproducibility of experiments across institutions and make it easier to compare NLP algorithms on contemporary Swedish. The dataset contains sentences from 1950 to 2015 and has been carefully designed to feature a good mix of genres balanced over each included decade. The sources include literary, journalistic, academic and legal texts, as well as blogs and web forum entries.}, booktitle = {Linköping Electronic Conference Proceedings. Digital Humanities 2016. From Digitization to Knowledge 2016: Resources and Methods for Semantic Processing of Digital Works/Texts, July 11, 2016, Krakow, Poland}, author = {Rødven-Eide, Stian and Tahmasebi, Nina and Borin, Lars}, year = {2016}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7685-733-5}, } @misc{volodina-etal-2016-preface-248087, title = {Preface. 
Proceedings of the joint workshop on NLP for Computer Assisted Language Learning and NLP for Language Acquisition at SLTC, Umeå, 16th November 2016}, abstract = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) & NLP for Language Acquisition (LA) – shorthand NLP4CALL&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below. The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics. The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition. 
The joint workshop series on NLP4CALL&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.}, author = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars}, year = {2016}, number = {130}, pages = {i--viii}, } @inProceedings{borin-etal-2014-bring-198549, title = {Bring vs. MTRoget: Evaluating automatic thesaurus translation}, booktitle = {Proceedings of LREC 2014, May 26-31, 2014 Reykjavik, Iceland}, author = {Borin, Lars and Allwood, Jens and de Melo, Gerard}, year = {2014}, publisher = {European Language Resources Association}, ISBN = {978-2-9517408-8-4}, } @misc{volodina-etal-2016-proceedings-248081, title = {Proceedings of the joint workshop on NLP for Computer Assisted Language Learning and NLP for Language Acquisition at SLTC, Umeå, 16th November 2016}, abstract = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) & NLP for Language Acquisition (LA) – shorthand NLP4CALL&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below. The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. 
The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics. The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition. 
The joint workshop series on NLP4CALL&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.}, author = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars}, year = {2016}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7685-633-8}, } @incollection{borin-2016-lexikografi-246607, title = {Lexikografi för maskiner och lexikografi för människor}, booktitle = {Framtidens lexikografi: Rapport från ett symposium i Göteborg 5 oktober 2012}, author = {Borin, Lars}, year = {2016}, publisher = {Meijerbergs institut vid Göteborgs universitet}, address = {Göteborg}, ISBN = {978-91-87850-01-1}, pages = {9--27}, } @article{tahmasebi-etal-2015-visions-212969, title = {Visions and open challenges for a knowledge-based culturomics}, abstract = {The concept of culturomics was born out of the availability of massive amounts of textual data and the interest to make sense of cultural and language phenomena over time. Thus far however, culturomics has only made use of, and shown the great potential of, statistical methods. In this paper, we present a vision for a knowledge-based culturomics that complements traditional culturomics. We discuss the possibilities and challenges of combining knowledge-based methods with statistical methods and address major challenges that arise due to the nature of the data; diversity of sources, changes in language over time as well as temporal dynamics of information in general. 
We address all layers needed for knowledge-based culturomics, from natural language processing and relations to summaries and opinions.}, journal = {International Journal on Digital Libraries}, author = {Tahmasebi, Nina and Borin, Lars and Capannini, Gabriele and Dubhashi, Devdatt and Exner, Peter and Forsberg, Markus and Gossen, Gerhard and Johansson, Fredrik and Johansson, Richard and Kågebäck, Mikael and Mogren, Olof and Nugues, Pierre and Risse, Thomas}, year = {2015}, volume = {15}, number = {2-4}, pages = {169--187}, } @inProceedings{borin-kosinski-2016-towards-238147, title = {Towards interactive visualization of public discourse in time and space}, abstract = {We report on a proof-of-concept study where we (1) apply NLP tools for extracting political-discourse topics from a large Swedish Twitter dataset; and (2) design an interactive spatiotemporal visualization application allowing humanities and social-science scholars to explore how the tweet topics vary over space and time.}, booktitle = {Linköping Electronic Conference Proceedings}, author = {Borin, Lars and Kosiński, Tomasz}, year = {2016}, volume = {126}, ISBN = {978-91-7685-733-5}, pages = {1--7}, } @inProceedings{borin-etal-2013-mining-188846, title = {Mining semantics for culturomics: towards a knowledge-based approach}, abstract = {The massive amounts of text data made available through the Google Books digitization project have inspired a new field of big-data textual research. Named culturomics, this field has attracted the attention of a growing number of scholars over recent years. However, initial studies based on these data have been criticized for not referring to relevant work in linguistics and language technology. This paper provides some ideas, thoughts and first steps towards a new culturomics initiative, based this time on Swedish data, which pursues a more knowledge-based approach than previous work in this emerging field. 
The amount of new Swedish text produced daily and older texts being digitized in cultural heritage projects grows at an accelerating rate. These volumes of text being available in digital form have grown far beyond the capacity of human readers, leaving automated semantic processing of the texts as the only realistic option for accessing and using the information contained in them. The aim of our recently initiated research program is to advance the state of the art in language technology resources and methods for semantic processing of Big Swedish text and focus on the theoretical and methodological advancement of the state of the art in extracting and correlating information from large volumes of Swedish text using a combination of knowledge-based and statistical methods.}, booktitle = {2013 ACM International Workshop on Mining Unstructured Big Data Using Natural Language Processing, UnstructureNLP 2013, Held at 22nd ACM International Conference on Information and Knowledge Management, CIKM 2013; San Francisco, CA; United States; 28 October 2013 through 28 October 2013}, author = {Borin, Lars and Dubhashi, Devdatt and Forsberg, Markus and Johansson, Richard and Kokkinakis, Dimitrios and Nugues, Pierre}, year = {2013}, ISBN = {978-1-4503-2415-1}, pages = {3--10}, } @inProceedings{viklund-borin-2016-data-236738, title = {How can big data help us study rhetorical history?}, abstract = {Rhetorical history is traditionally studied through rhetorical treatises or selected rhetorical practices, for example the speeches of major orators. Although valuable sources, these do not give us the answers to all our questions. Indeed, focus on a few canonical works or the major historical key figures might even lead us to reproduce cultural self-identifications and false generalizations. 
However, thanks to increasing availability of relevant digitized texts, we are now at a point where it is possible to see how new research questions can be formulated – and how old research questions can be addressed from a new angle or established results verified – on the basis of exhaustive collections of data, rather than small samples, but where a methodology has not yet established itself. The aim of this paper is twofold: (1) We wish to demonstrate the usefulness of large-scale corpus studies (“text mining”) in the field of rhetorical history, and hopefully point to some interesting research problems and how they can be analyzed using “big-data” methods. (2) In doing this, we also aim to make a contribution to method development in e-science for the humanities and social sciences, and in particular in the framework of CLARIN. }, booktitle = {Linköping Electronic Conference Proceedings, No. 123. Edited by Koenraad De Smedt. Selected Papers from the CLARIN Annual Conference 2015. October 14–16, 2015, Wroclaw, Poland}, author = {Viklund, Jon and Borin, Lars}, year = {2016}, volume = {123}, ISBN = {978-91-7685-765-6}, pages = {79--93}, } @article{adesam-etal-2016-sprakteknologi-237884, title = {Språkteknologi för svenska språket genom tiderna}, abstract = {Språkbanken, the Swedish Language Bank, is a language technology research unit at the Department of Swedish, University of Gothenburg. We develop language resources – such as corpora, lexical resources, and analytical tools – for all variants of Swedish, from Old Swedish laws to present-day social media. Historical texts offer exciting theoretical and methodological challenges for language technology because they often defy the assumption inherent in most automatic analysis tools that the texts contain a standardized written language. 
In this article, we describe our ongoing work on the development of annotated historical corpora, as well as our efforts on linking various resources (both corpora and lexical resources). This research advances the state of the art of language technology as well as enables new research for scholars in other disciplines.}, journal = {Kungliga Skytteanska Samfundets Handlingar}, author = {Adesam, Yvonne and Ahlberg, Malin and Andersson, Peter and Borin, Lars and Bouma, Gerlof and Forsberg, Markus}, year = {2016}, volume = {76}, number = {Studier i svensk språkhistoria 13}, pages = {65--87}, } @inProceedings{tahmasebi-etal-2016-clarin-233899, title = {SWE-CLARIN – the Swedish CLARIN project – aims and activities}, booktitle = {Digital Humanities in the Nordic countries, Oslo, March 15-17 2016}, author = {Tahmasebi, Nina and Borin, Lars and Jordan, Caspar and Ekman, Stefan}, year = {2016}, pages = {122--123}, } @techreport{borin-etal-2016-free-233768, title = {A free cloud service for OCR / En fri molntjänst för OCR}, author = {Borin, Lars and Bouma, Gerlof and Dannélls, Dana}, year = {2016}, institution = {University of Gothenburg}, address = {Göteborg}, } @proceedings{volodina-etal-2015-proceedings-226574, title = {Proceedings of the 4th workshop on NLP for computer assisted language learning at Nodalida 2015, Vilnius, 11th May, 2015}, editor = {Volodina, Elena and Borin, Lars and Pilán, Ildikó}, year = {2015}, publisher = {Linköping University Press}, address = {Linköping}, ISBN = {978-91-7519-036-5}, } @article{forsberg-etal-2014-from-208123, title = {From construction candidates to constructicon entries: An experiment using semi-automatic methods for identifying constructions in corpora}, abstract = {We present an experiment where natural language processing tools are used to automatically identify potential constructions in a corpus. The experiment was conducted as part of the ongoing efforts to develop a Swedish constructicon. 
Using an automatic method to suggest constructions has advantages not only for efficiency but also methodologically: it forces the analyst to look more objectively at the constructions actually occurring in corpora, as opposed to focusing on “interesting” constructions only. As a heuristic for identifying potential constructions, the method has proved successful, yielding about 200 (out of 1,200) highly relevant construction candidates.}, journal = {Constructions and Frames}, author = {Forsberg, Markus and Johansson, Richard and Bäckström, Linnéa and Borin, Lars and Lyngfelt, Benjamin and Olofsson, Joel and Prentice, Julia}, year = {2014}, volume = {6}, number = {1}, pages = {114--135}, } @inProceedings{borin-etal-2015-here-217351, title = {Here be dragons? The perils and promises of inter-resource lexical-semantic mapping}, abstract = {Lexical-semantic knowledge sources are a stock item in the language technologist’s toolbox, having proved their practical worth in many and diverse natural language processing (NLP) applications. In linguistics, lexical semantics comes in many flavors, but in the NLP world, wordnets reign more or less supreme. There has been some promising work utilizing Roget-style thesauruses instead, but wider experimentation is hampered by the limited availability of such resources. The work presented here is a first step in the direction of creating a freely available Roget-style lexical resource for modern Swedish. Here, we explore methods for automatic disambiguation of inter-resource mappings with the longer-term goal of utilizing similar techniques for automatic enrichment of lexical-semantic resources.}, booktitle = {Linköping Electronic Conference Proceedings. Semantic resources and semantic annotation for Natural Language Processing and the Digital Humanities. 
Workshop at NODALIDA , May 11, 13-18 2015, Vilnius}, author = {Borin, Lars and Nieto Piña, Luis and Johansson, Richard}, year = {2015}, volume = {112}, ISBN = {978-91-7519-049-5}, pages = {1--11}, } @incollection{ribeck-borin-2014-lexical-201965, title = {Lexical Bundles in Swedish Secondary School Textbooks}, abstract = {The present paper describes the process of identifying lexical bundles, i.e., frequently recurring word sequences such as by means of and in the end of, in secondary school history and physics textbooks. In its determination of finding genuine lexical bundles, i.e. the word boundaries between lexical bundles and surrounding arbitrary words, it proposes a new approach to come to terms with the problem of extracting overlapping bundles of different lengths. The results of the structural classification indicate that history uses more NP/PP-based and less dependent-clause-based bundles than physics. The comparative analysis manages to restrict this difference to the referential function. History almost only refers to phrases, i.e. within clauses, while physics much more tends to make references across clauses. 
The article also includes a report on an extension of the study, ongoing work where the automatic identification of multi-word expressions in general is in focus.}, booktitle = {Human Language Technology Challenges for Computer Science and Linguistics 5th Language and Technology Conference, LTC 2011, Poznań, Poland, November 25--27, 2011, Revised Selected Papers}, editor = {Vetulani, Zygmunt and Mariani, Joseph}, author = {Ribeck, Judy Carola and Borin, Lars}, year = {2014}, publisher = {Springer International Publishing}, volume = {2014}, number = {XVI}, address = {Cham}, ISBN = {978-3-319-08958-4}, pages = {238--249}, } @inProceedings{ahlberg-etal-2014-swedish-210083, title = {Swedish FrameNet++ The Beginning of the End and the End of the Beginning}, booktitle = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014}, author = {Ahlberg, Malin and Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Friberg Heppin, Karin and Johansson, Richard and Kokkinakis, Dimitrios and Olsson, Leif-Jöran and Uppström, Jonatan}, year = {2014}, } @inProceedings{kokkinakis-etal-2014-hfst-209800, title = {HFST-SweNER. A New NER Resource for Swedish}, abstract = {Named entity recognition (NER) is a knowledge-intensive information extraction task that is used for recognizing textual mentions of entities that belong to a predefined set of categories, such as locations, organizations and time expressions. NER is a challenging, difficult, yet essential preprocessing technology for many natural language processing applications, and particularly crucial for language understanding. NER has been actively explored in academia and in industry especially during the last years due to the advent of social media data. 
This paper describes the conversion, modeling and adaptation of a Swedish NER system from a hybrid environment, with integrated functionality from various processing components, to the Helsinki Finite-State Transducer Technology (HFST) platform. This new HFST-based NER (HFST-SweNER) is a full-fledged open source implementation that supports a variety of generic named entity types and consists of multiple, reusable resource layers, e.g., various n-gram-based named entity lists (gazetteers).}, booktitle = {Proceedings of the 9th edition of the Language Resources and Evaluation Conference (LREC), Reykjavik 26 - 31 May 2014}, author = {Kokkinakis, Dimitrios and Niemi, Jyrki and Hardwick, Sam and Lindén, Krister and Borin, Lars}, year = {2014}, ISBN = {978-2-9517408-8-4}, pages = {2537--2543}, } @inProceedings{adesam-etal-2014-koala-211376, title = {Koala – Korp’s Linguistic Annotations Developing an infrastructure for text-based research with high-quality annotations}, booktitle = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014}, author = {Adesam, Yvonne and Borin, Lars and Bouma, Gerlof and Forsberg, Markus and Johansson, Richard}, year = {2014}, } @incollection{rama-borin-2015-comparative-197484, title = {Comparative evaluation of string similarity measures for automatic language classification}, booktitle = {Sequences in Language and Text}, author = {Rama, Taraka and Borin, Lars}, year = {2015}, publisher = {De Gruyter Mouton}, ISBN = {978-3-11-036287-9}, } @article{borin-etal-2014-geographic-198286, title = {Geographic visualization of place names in Swedish literary texts}, abstract = {This article describes the development of a geographical information system (GIS) at Språkbanken as part of a visualization solution to be used in an archive of historical Swedish literary texts. 
The research problems we are aiming to address concern orthographic and morphological variation, missing place names, and missing place name coordinates. Some of these problems form a central part in the development of methods and tools for the automatic analysis of historical Swedish literary texts at our research unit. We discuss the advantages and challenges of covering large-scale spelling variation in place names from different sources and in generating maps with focus on different time periods.}, journal = {Literary \& Linguistic Computing}, author = {Borin, Lars and Dannélls, Dana and Olsson, Leif-Jöran}, year = {2014}, volume = {29}, number = {3}, pages = {400--404}, } @inProceedings{lyngfelt-etal-2014-svenskt-208457, title = {Ett svenskt konstruktikon. Grammatik möter lexikon}, booktitle = {Svenskans beskrivning : Förhandlingar vid Trettiotredje sammankomsten för svenskans beskrivning. Helsingfors den 15–17 maj 2013}, author = {Lyngfelt, Benjamin and Borin, Lars and Bäckström, Linnéa and Forsberg, Markus and Olsson, Leif-Jöran and Prentice, Julia and Rydstedt, Rudolf and Sköldberg, Emma and Tingsell, Sofia and Uppström, Jonatan}, year = {2014}, volume = {33}, ISBN = {978-951-51-0120-4}, pages = {268--279}, } @article{borin-etal-2014-introduction-202127, title = {Introduction: Constructions and frames meet language technology}, journal = {Constructions and Frames}, author = {Borin, Lars and de Melo, Gerard and Friberg Heppin, Karin and Torrent, Tiago Timponi}, year = {2014}, volume = {6}, number = {1}, pages = {1--8}, } @proceedings{volodina-etal-2014-proceedings-206135, title = {Proceedings of the third workshop on NLP for computer-assisted language learning at SLTC 2014, Uppsala University}, abstract = {The workshop series on NLP for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech technologies in CALL systems and exploring the theoretical and 
methodological issues arising in this connection. The papers in the proceedings volume from the third NLP4CALL workshop cover three main topic areas: resources for development of ICALL applications (e.g., learner corpora and coursebook corpora), tools and algorithms for the analysis of learner language (e.g., focusing on collocations, reading tasks, cloze items, pronunciation, spelling, level classification of learner production), and the generation of learning materials (e.g., exercise generators).}, editor = {Volodina, Elena and Borin, Lars and Pilán, Ildikó}, year = {2014}, publisher = {Linköping University Press}, address = {Linköping}, ISBN = {978-91-7519-175-1}, } @inProceedings{borin-etal-2014-representing-204731, title = {Representing Swedish Lexical Resources in RDF with lemon}, abstract = {The paper presents an ongoing project which aims to publish Swedish lexical-semantic resources using Semantic Web and Linked Data technologies. In this article, we highlight the practical conversion methods and challenges of converting three of the Swedish language resources in RDF with lemon.}, booktitle = {Proceedings of the ISWC 2014 Posters \& Demonstrations Track a track within the 13th International Semantic Web Conference (ISWC 2014)}, author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and McCrae, John P.}, year = {2014}, volume = {1272}, pages = {329--332}, } @inProceedings{volodina-etal-2014-flexible-201885, title = {A flexible language learning platform based on language resources and web services}, abstract = {We present Lärka, the language learning platform of Språkbanken (the Swedish Language Bank). It consists of an exercise generator which reuses resources available through Språkbanken: mainly Korp, the corpus infrastructure, and Karp, the lexical infrastructure. 
Through Lärka we reach new user groups – students and teachers of Linguistics as well as second language learners and their teachers – and this way bring Språkbanken's resources in a relevant format to them. Lärka can therefore be viewed as a case of a real-life language resource evaluation with end users. In this article we describe Lärka's architecture, its user interface, and the five exercise types that have been released for users so far. The first user evaluation following in-class usage with students of linguistics, speech therapy and teacher candidates are presented. The outline of future work concludes the paper.}, booktitle = {Proceedings of LREC 26-31 May 2014, Reykjavik, Iceland}, author = {Volodina, Elena and Pilán, Ildikó and Borin, Lars and Lindström Tiedemann, Therese}, year = {2014}, ISBN = {978-2-9517408-8-4}, pages = {3973--3978}, } @inProceedings{rehm-etal-2014-strategic-198556, title = {The strategic impact of META-NET on the regional, national and international level}, booktitle = {Proceedings of LREC 2014, 26-31 May, Reykjavik, Iceland}, author = {Rehm, Georg and Uszkoreit, Hans and Ananiadou, Sophia and Bel, Núria and Bieleviciene, Audrone and Borin, Lars and Branco, António and Budin, Gerhard and Calzolari, Nicoletta and Daelemans, Walter and Garabík, Radovan and Grobelnik, Marko and Garcia-Mateo, Carmen and van Genabith, Josef and Hajic, Jan and Hernaez, Inma and Judge, John and Koeva, Svetla and Krek, Simon and Krstev, Cvetana and Lindén, Krister and Magnini, Bernardo and Mariani, Joseph and McNaught, John and Melero, Maite and Monachini, Monica and Moreno, Asuncion and Odijk, Jan and Ogrodniczuk, Maciej and Pezik, Piotr and Piperidis, Stelios and Przepiórkowski, Adam and Rögnvaldsson, Eiríkur and Rosner, Michael and Pedersen, Bolette Sandford and Skadina, Inguna and De Smedt, Koenraad and Tadić, Marko and Thompson, Paul and Tufiș, Dan and Váradi, Tamás and Vasiljevs, Andrejs and Vider, Kadri and Zabarskaite, Jolanta}, year = {2014}, 
ISBN = {978-2-9517408-8-4}, pages = {1517--1524}, } @inProceedings{borin-etal-2014-linguistic-198551, title = {Linguistic landscaping of South Asia using digital language resources: Genetic vs. areal linguistics}, booktitle = {Proceedings of LREC, May 26-31, 2014, Reykjavik, Iceland}, author = {Borin, Lars and Saxena, Anju and Rama, Taraka and Comrie, Bernard}, year = {2014}, ISBN = {978-2-9517408-8-4}, pages = {3137--3144}, } @inProceedings{skadina-etal-2013-baltic-194532, title = {Baltic and Nordic parts of the European linguistic infrastructure}, booktitle = {71. Proceedings of the 19th Nordic Conference of Computational Linguistics (NODALIDA 2013) 22-24, May 2013 Oslo, Norway}, author = {Skadina, Inguna and Vasiljevs, Andrejs and Borin, Lars and Lindén, Krister and Losnegaard, Gyri and Pedersen, Bolette Sandford and Rozis, Roberts and De Smedt, Koenraad}, year = {2013}, ISBN = {978-91-7519-589-6}, pages = {195--211}, } @article{borin-etal-2013-saldo-188604, title = {SALDO: a touch of yin to WordNet's yang}, abstract = {The English-language Princeton WordNet (PWN) and some wordnets for other languages have been extensively used as lexical–semantic knowledge sources in language technology applications, due to their free availability and their size. The ubiquitousness of PWN-type wordnets tends to overshadow the fact that they represent one out of many possible choices for structuring a lexical-semantic resource, and it could be enlightening to look at a differently structured resource both from the point of view of theoretical–methodological considerations and from the point of view of practical text processing requirements. The resource described here—SALDO—is such a lexical–semantic resource, intended primarily for use in language technology applications, and offering an alternative organization to PWN- style wordnets. We present our work on SALDO, compare it with PWN, and discuss some implications of the differences. 
We also describe an integrated infrastructure for computational lexical resources where SALDO forms the central component.},
  journal = {Language resources and evaluation},
  author = {Borin, Lars and Forsberg, Markus and Lönngren, Lennart},
  year = {2013},
  volume = {47},
  number = {4},
  pages = {1191--1211},
}

@inproceedings{borin-forsberg-2014-swesaurus;-193085,
  title = {Swesaurus; or, The Frankenstein Approach to Wordnet Construction},
  abstract = {Swesaurus is a freely available (under a CC-BY license) Swedish wordnet under construction, built primarily by scavenging and recycling information from a number of existing lexical resources. Among its more unusual characteristics are graded lexical-semantic relations and inclusion of all parts of speech, not only open-class items. },
  booktitle = {Proceedings of the Seventh Global WordNet Conference (GWC 2014)},
  author = {Borin, Lars and Forsberg, Markus},
  year = {2014},
  isbn = {978-9949-32-492-7},
}

@article{rama-borin-2014-gram-187121,
  title = {N-Gram Approaches to the Historical Dynamics of Basic Vocabulary},
  journal = {Journal of Quantitative Linguistics},
  author = {Rama, Taraka and Borin, Lars},
  year = {2014},
  volume = {21},
  number = {1},
  pages = {50--64},
}

% The volume editors, publisher and place were previously embedded in the
% booktitle as one pre-formatted citation; they now live in their own fields.
@inproceedings{skoldberg-etal-2013-between-186041,
  title = {Between Grammars and Dictionaries: a Swedish Constructicon},
  abstract = {This paper introduces the Swedish Constructicon (SweCxn), a database of Swedish constructions currently under development. We also present a small study of the treatment of constructions in Swedish (paper) dictionaries, thus illustrating the need for a constructionist approach, and discuss three different methods used to identify potential constructions for inclusion in the constructicon.
SweCxn is a freely available electronic resource, with a particular focus on semi-general linguistic patterns of the type that are difficult to account for from a purely lexicographic or a purely grammatical perspective, and which therefore have tended to be neglected in both dictionaries and grammars. Far from being a small set of borderline cases, such constructions are both numerous and common. They are also quite problematic for second language acquisition as well as LT applications. Accordingly, various kinds of multi-word units have received more attention in recent years, not least from a lexicographic perspective. The coverage, however, is only partial, and the productivity of many constructions is hard to capture from a lexical viewpoint. To identify constructions for SweCxn, we use a combination of methods, such as working from existing construction descriptions for Swedish and other languages, applying LT tools to discover recurring patterns in texts, and extrapolating constructional information from dictionaries. },
  editor = {Kosem, I. and Kallas, J. and Gantar, P. and Krek, S. and Langemets, M. and Tuulik, M.},
  booktitle = {Electronic lexicography in the 21st century: thinking outside the paper. Proceedings of the eLex 2013 conference, 17-19 October 2013, Tallinn, Estonia},
  publisher = {Trojina, Institute for Applied Slovene Studies/Eesti Keele Instituut},
  address = {Ljubljana/Tallinn},
  author = {Sköldberg, Emma and Bäckström, Linnéa and Borin, Lars and Forsberg, Markus and Lyngfelt, Benjamin and Olsson, Leif-Jöran and Prentice, Julia and Rydstedt, Rudolf and Tingsell, Sofia and Uppström, Jonatan},
  year = {2013},
  pages = {310--327},
}

@article{borin-johansson-2014-kulturomik-192931,
  title = {Kulturomik: Att spana efter språkliga och kulturella förändringar i digitala textarkiv},
  journal = {Historia i en digital värld},
  author = {Borin, Lars and Johansson, Richard},
  year = {2014},
}

% The edited proceedings volumes below use the standard proceedings type
% instead of the non-standard "edited_book" type, which standard styles do
% not define.
@proceedings{borin-volodina-2012-proceedings-188679,
  title = {Proceedings of the SLTC 2012 workshop on NLP for CALL},
  editor = {Borin, Lars and Volodina, Elena},
  year = {2012},
  publisher = {LiU Electronic Press},
  address = {Linköping},
}

@proceedings{desmedt-etal-2013-proceedings-190263,
  title = {Proceedings of the workshop on Nordic language research infrastructure at NODALIDA 2013, May 22-24, 2013, Oslo, Norway},
  editor = {De Smedt, Koenraad and Borin, Lars and Lindén, Krister and Maegaard, Bente and Rögnvaldsson, Eiríkur and Vider, Kadri},
  year = {2013},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  isbn = {978-91-7519-585-8},
}

@proceedings{borin-etal-2013-proceedings-190260,
  title = {Proceedings of the workshop on lexical semantic resources for NLP at NODALIDA 2013, May 22-24, 2013, Oslo, Norway},
  editor = {Borin, Lars and Fjeld, Ruth Vatvedt and Forsberg, Markus and Nimb, Sanni and Nugues, Pierre and Pedersen, Bolette Sandford},
  year = {2013},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  isbn = {978-91-7519-586-5},
}

% NOTE(review): the key below contains '?' (apparently a mangled non-ASCII
% letter from the first editor's surname); it is left unchanged so that
% existing citations keep resolving.
@proceedings{ey?orsson-etal-2013-proceedings-190256,
  title = {Proceedings of the workshop on computational historical linguistics at NODALIDA 2013, May 22-24, 2013, Oslo, Norway},
  editor = {Eyþórsson, Þórhallur and Borin, Lars and Haug, Dag and Rögnvaldsson, Eiríkur},
  year = {2013},
  publisher =
{Linköping University Electronic Press},
  address = {Linköping},
  isbn = {978-91-7519-587-2},
}

% Publisher and address are spelled as in the sibling NODALIDA 2013 workshop
% volumes; the standard proceedings type replaces the non-standard
% "edited_book" type.
@proceedings{volodina-etal-2013-proceedings-188675,
  title = {Proceedings of the second workshop on NLP for computer-assisted language learning at NODALIDA 2013, May 22-24, 2013, Oslo, Norway},
  editor = {Volodina, Elena and Borin, Lars and Loftsson, Hrafn},
  year = {2013},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  isbn = {978-91-7519-588-9},
}

@inproceedings{borin-etal-2012-open-156079,
  title = {The open lexical infrastructure of Språkbanken},
  abstract = {We present our ongoing work on Karp, Språkbanken’s (the Swedish Language Bank) open lexical infrastructure, which has two main functions: (1) to support the work on creating, curating, and integrating our various lexical resources; and (2) to publish daily versions of the resources, making them searchable and downloadable. An important requirement on the lexical infrastructure is also that we maintain a strong bidirectional connection to our corpus infrastructure. At the heart of the infrastructure is the SweFN++ project with the goal to create free Swedish lexical resources geared towards language technology applications. The infrastructure currently hosts 15 Swedish lexical resources, including historical ones, some of which have been created from scratch using existing free resources, both external and in-house. The resources are integrated through links to a pivot lexical resource, SALDO, a large morphological and lexical-semantic resource for modern Swedish. SALDO has been selected as the pivot partly because of its size and quality, but also because its form and sense units have been assigned persistent identifiers (PIDs) to which the lexical information in other lexical resources and in corpora are linked.},
  booktitle = {Proceedings of the 8th International Conference on Language Resources and Evaluation : May 23-25, 2012 / eds.
Nicoletta Calzolari},
  author = {Borin, Lars and Forsberg, Markus and Olsson, Leif-Jöran and Uppström, Jonatan},
  year = {2012},
  isbn = {978-2-9517408-7-7},
  pages = {3598--3602},
}

@article{borin-etal-2013-close-187063,
  title = {Close encounters of the fifth kind: Some linguistic and computational aspects of the Swedish FrameNet++ project},
  abstract = {The Swedish FrameNet++ (SweFN++) project aims at developing an integrated Swedish lexical macro-resource to be used primarily in language technology R\&D to build natural language processing (NLP) applications. Most of the component resources making up SweFN++ are existing digital lexical resources; in their case the central project effort is directed at making them interoperable on as many levels as possible. An important new resource being created in the project is a Swedish framenet. Now a sister project is starting with the aim of adding a Swedish constructicon (SweCxn) to the macro-resource. In this paper, we discuss some theoretical and conceptual issues which have arisen in the course of our work on the SweFN++ and the planning of the SweCxn, in the close encounter between the practical requirements of NLP and the theory and practice of linguistic – lexical and grammatical – description.
},
  journal = {Veredas},
  author = {Borin, Lars and Forsberg, Markus and Lyngfelt, Benjamin},
  year = {2013},
  volume = {17},
  number = {1},
  pages = {28--43},
}

% Editor names are separated with " and " (a ";" is not a name separator and
% made the two editors parse as a single name).
@incollection{borin-etal-2013-intercontinental-184760,
  title = {The Intercontinental Dictionary Series – a rich and principled database for language comparison},
  booktitle = {Approaches to Measuring Linguistic Differences},
  editor = {Borin, Lars and Saxena, Anju},
  author = {Borin, Lars and Comrie, Bernard and Saxena, Anju},
  year = {2013},
  publisher = {De Gruyter Mouton},
  address = {Berlin},
  isbn = {978-3-11-030525-8},
  pages = {285--302},
}

@inproceedings{borin-etal-2012-korp-156080,
  title = {Korp – the corpus infrastructure of Språkbanken},
  abstract = {We present Korp, the corpus infrastructure of Språkbanken (the Swedish Language Bank). The infrastructure consists of three main components: the Korp corpus pipeline, the Korp backend, and the Korp frontend. The Korp corpus pipeline is used for importing corpora, annotating them, and then exporting the annotated corpora into different formats. An essential feature of the pipeline is the ability to leave existing annotations untouched, both structural and word level annotations, and to use the existing annotations as the foundation of other annotations. The Korp backend consists of a set of REST-based web services for searching in and retrieving information about the corpora. Finally, the Korp frontend is a graphical search interface that interacts with the Korp backend. The interface has been inspired by corpus search interfaces such as SketchEngine, Glossa, and DeepDict, and it uses State Chart XML (SCXML) in order to enable users to bookmark interaction states. We give a functional and technical overview of the three components, followed by a discussion of planned future work. },
  booktitle = {Proceedings of LREC 2012.
Istanbul: ELRA},
  author = {Borin, Lars and Forsberg, Markus and Roxendal, Johan},
  year = {2012},
  pages = {474--478},
}

@incollection{borin-2013-measuring-184758,
  title = {The why and how of measuring linguistic differences},
  booktitle = {Approaches to Measuring Linguistic Differences},
  editor = {Borin, Lars and Saxena, Anju},
  author = {Borin, Lars},
  year = {2013},
  publisher = {De Gruyter Mouton},
  address = {Berlin},
  isbn = {978-3-11-030525-8},
  pages = {3--26},
}

@incollection{saxena-borin-2013-carving-184759,
  title = {Carving Tibeto-Kanauri by its joints: Using basic vocabulary lists for genetic grouping of languages},
  booktitle = {Approaches to Measuring Linguistic Differences},
  editor = {Borin, Lars and Saxena, Anju},
  author = {Saxena, Anju and Borin, Lars},
  year = {2013},
  publisher = {De Gruyter Mouton},
  address = {Berlin},
  isbn = {978-3-11-030525-8},
  pages = {175--198},
}

% Edited volume: a book entry with an editor list replaces the non-standard
% "edited_book" type.
@book{borin-saxena-2013-approaches-184757,
  title = {Approaches to Measuring Linguistic Differences},
  abstract = {The present volume collects contributions addressing different aspects of the measurement of linguistic differences, a topic which probably is as old as language itself but at the same time has acquired renewed interest over the last decade or so, reflecting a rapid development of data-intensive computing in all fields of research, including linguistics.},
  editor = {Borin, Lars and Saxena, Anju},
  year = {2013},
  publisher = {De Gruyter Mouton},
  address = {Berlin},
  isbn = {978-3-11-030525-8},
}

@inproceedings{dannells-etal-2013-mapserver-178095,
  title = {MapServer for Swedish Language Technology},
  abstract = {The MapServer application used by the Swedish Language Bank provides new opportunities for visualizing geographical information found in its large repository of written texts, in particular literary texts. The application is capable of performing coordinate search on the basis of recognized place names and rendering both static and dynamic maps that display their geographical locations.
},
  booktitle = {Digital Humanities},
  author = {Dannélls, Dana and Borin, Lars and Olsson, Leif-Jöran},
  year = {2013},
}

@inproceedings{pedersen-etal-2013-nordic-178357,
  title = {Nordic and Baltic wordnets aligned and compared through “WordTies”},
  abstract = {During the last few years, extensive wordnets have been built locally for the Nordic and Baltic languages applying very different compilation strategies. The aim of the present investigation is to consolidate and examine these wordnets through an alignment via Princeton Core WordNet and thereby compare them along the measures of taxonomical structure, synonym structure, and assigned relations to approximate to a best practice. A common web interface and visualizer “WordTies” is developed to facilitate this purpose. Four bilingual wordnets are automatically processed and evaluated exposing interesting differences between the wordnets. Even if the alignments are judged to be of a good quality, the precision of the translations vary due to considerable differences in hyponymy depth and interpretation of the synset. All seven monolingual and four bilingual wordnets as well as WordTies have been made available via META-SHARE through the META-NORD project.},
  booktitle = {Proceedings of the 19th Nordic Conference of Computational Linguistics (NODALIDA 2013), May 22–24, 2013, Oslo University, Norway. NEALT Proceedings Series 16},
  author = {Pedersen, Bolette and Borin, Lars and Forsberg, Markus and Kahusk, Neeme and Lindén, Krister and Niemi, Jyrki and Nisbeth, Niklas and Nygaard, Lars and Orav, Heili and Rögnvaldsson, Eiríkur and Seaton, Mitchel and Vider, Kadri and Voionmaa, Kaarlo},
  year = {2013},
  number = {16},
  pages = {147--162},
}

@inproceedings{backstrom-etal-2013-automatic-178351,
  title = {Automatic identification of construction candidates for a Swedish constructicon},
  abstract = {We present an experiment designed for extracting construction candidates for a Swedish constructicon from text corpora.
We have explored the use of hybrid n-grams with the practical goal to discover previously undescribed partially schematic constructions. The experiment was successful, in that quite a few new constructions were discovered. The precision is low, but as a push-button tool for construction discovery, it has proven a valuable tool for the work on a Swedish constructicon.},
  booktitle = {Proceedings of the workshop on lexical semantic resources for NLP at NODALIDA 2013, May 22-24, 2013, Oslo, Norway. NEALT Proceedings Series 19},
  author = {Bäckström, Linnéa and Borin, Lars and Forsberg, Markus and Lyngfelt, Benjamin and Prentice, Julia and Sköldberg, Emma},
  year = {2013},
  pages = {2--11},
}

@inproceedings{borin-etal-2012-growing-171988,
  title = {Growing a Swedish constructicon in lexical soil},
  booktitle = {Proceedings of the Swedish Language Technology Conference. Lund, October 24-26, 2012},
  author = {Borin, Lars and Forsberg, Markus and Lyngfelt, Benjamin and Prentice, Julia and Rydstedt, Rudolf and Sköldberg, Emma and Tingsell, Sofia},
  year = {2012},
  pages = {10--11},
}

% The booktitle previously carried a hard-coded "In ... (eds)" prefix plus
% place and publisher; these are now separate fields. The second author's
% name parts were swapped ("Hrafn, Loftsson").
@inproceedings{volodina-etal-2012-towards-168516,
  title = {Towards a system architecture for ICALL},
  abstract = {In this paper, we present an on-going project whose overall aim is to develop open-source system architecture for supporting ICALL systems that will facilitate re-use of existing NLP tools and resources on a plug-and-play basis. We introduce the project, describe the approaches adopted by the two language teams, and present two applications being developed using the proposed architecture.},
  editor = {Biswas, G. and others},
  booktitle = {Proceedings of the 20th International Conference on Computers in Education},
  publisher = {Asia-Pacific Society for Computers in Education},
  address = {Singapore},
  author = {Volodina, Elena and Loftsson, Hrafn and Arnbjörnsdóttir, Birna and Borin, Lars and Leifsson, Guðmundur Örn},
  year = {2012},
  volume = {2012},
  isbn = {978-981-07-4649-0},
}

% Edited volume: a book entry with an editor list replaces the non-standard
% "edited_book" type.
@book{larsson-borin-2012-from-167661,
  title = {From Quantification to Conversation},
  editor = {Larsson, Staffan and Borin, Lars},
  year = {2012},
  publisher = {College Publications},
  address = {London},
  isbn = {978-1-84890-091-2},
}

@incollection{borin-etal-2008-hunting-72504,
  title = {The hunting of the BLARK - SALDO, a freely available lexical database for Swedish language technology},
  booktitle = {Resourceful language technology. Festschrift in honor of Anna Sågvall Hein},
  author = {Borin, Lars and Forsberg, Markus and Lönngren, Lennart},
  year = {2008},
  publisher = {Uppsala University},
  address = {Uppsala},
  pages = {21--32},
}

@inproceedings{volodina-borin-2012-developing-168523,
  title = {Developing an Open-Source Web-Based Exercise Generator for Swedish},
  abstract = {This paper reports on the ongoing international project System architecture for ICALL and the progress made by the Swedish partner. The Swedish team is developing a web-based exercise generator reusing available annotated corpora and lexical resources. Apart from the technical issues like implementation of the user interface and the underlying processing machinery, a number of interesting pedagogical questions need to be solved, e.g., adapting learner-oriented exercises to proficiency levels; selecting authentic examples of an appropriate difficulty level; automatically ranking corpus examples by their quality; providing feedback to the learner, and selecting vocabulary for training domain-specific, academic or general-purpose vocabulary. In this paper we describe what has been done so far, mention the exercise types that can be generated at the moment as well as describe the tasks left for the future. },
  booktitle = {CALL: Using, Learning, Knowing.
EuroCALL Conference, Gothenburg, Sweden, 22-25 August 2012, Proceedings. Eds. Linda Bradley and Sylvie Thouësny. Research-publishing.net, Dublin, Ireland},
  author = {Volodina, Elena and Borin, Lars},
  year = {2012},
  volume = {2012},
  isbn = {978-1-908416-03-2},
}

@inproceedings{volodina-etal-2012-waste-165936,
  title = {Waste not, want not: Towards a system architecture for ICALL based on NLP component re-use},
  booktitle = {Proceedings of the SLTC 2012 workshop on NLP for CALL, Lund, 25th October, 2012},
  author = {Volodina, Elena and Borin, Lars and Loftsson, Hrafn and Arnbjörnsdóttir, Birna and Leifsson, Guðmundur Örn},
  year = {2012},
  pages = {47--58},
}

% "N -gram" / "N ." were text-extraction artifacts (a stray space after the
% italic N) and are rejoined here.
@inproceedings{rama-borin-2012-properties-164449,
  title = {Properties of phoneme N-grams across the world’s language families},
  abstract = {In this article, we investigate the properties of phoneme N-grams across half of the world’s languages. The sizes of three different N-gram distributions of the world’s language families obey a power law. Further, the N-gram distributions of language families parallel the sizes of the families, which also follow a power law distribution. The correlation between N-gram distributions and language family sizes improves with increasing values of N. The study also raises some new questions about the use of N-gram distributions in linguistic research, which we hope to be able to investigate in the future.},
  booktitle = {Proceedings of the Fourth Swedish Language Technology Conference (SLTC)},
  author = {Rama, Taraka and Borin, Lars},
  year = {2012},
}

@inproceedings{lyngfelt-etal-2012-adding-163582,
  title = {Adding a constructicon to the Swedish resource network of Språkbanken},
  abstract = {This paper presents the integrated Swedish resource network of Språkbanken in general, and its latest addition – a constructicon – in particular.
The constructicon, which is still in its early stages, is a collection of (partially) schematic multi-word units, constructions, developed as an addition to the Swedish FrameNet (SweFN). SweFN and the constructicon are integrated with other parts of Språkbanken, both lexical resources and corpora, through the lexical resource SALDO. In most respects, the constructicon is modeled on its English counterpart in Berkeley, and, thus, following the FrameNet format. The most striking differences are the inclusion of so-called collostructional elements and the treatment of semantic roles, which are defined globally instead of locally as in FrameNet. Incorporating subprojects such as developing methods for automatic identification of constructions in authentic text on the one hand, and accounting for constructions problematic for L2 acquisition on the other, the approach is highly cross-disciplinary in nature, combining various theoretical linguistic perspectives on construction grammar with language technology, lexicography, and L2 research.},
  booktitle = {11th Conference on Natural Language Processing (KONVENS) Proceedings},
  author = {Lyngfelt, Benjamin and Borin, Lars and Forsberg, Markus and Prentice, Julia and Rydstedt, Rudolf and Sköldberg, Emma and Tingsell, Sofia},
  year = {2012},
  isbn = {3-85027-005-X},
  pages = {452--461},
}

@book{borin-etal-2012-svenska-163410,
  title = {Svenska språket i den digitala tidsåldern},
  author = {Borin, Lars and Brandt, Martha and Edlund, Jens and Lindh, Jonas and Parkvall, Mikael},
  year = {2012},
  publisher = {Springer},
  address = {Berlin},
  isbn = {978-3-642-30831-4},
}

@incollection{borin-2012-core-162377,
  title = {Core vocabulary: A useful but mystical concept in some kinds of linguistics},
  booktitle = {Shall we play the festschrift game?
Essays on the Occasion of Lauri Carlson's 60th Birthday},
  author = {Borin, Lars},
  year = {2012},
  publisher = {Springer},
  address = {Berlin},
  isbn = {978-3-642-30772-0},
  pages = {53--65},
}

@inproceedings{borin-etal-2012-transferring-157213,
  title = {Transferring Frames: Utilization of Linked Lexical Resources},
  abstract = {In our experiment, we evaluate the transferability of frames from Swedish to Finnish in parallel corpora. We evaluate both the theoretical possibility of transferring frames and the possibility of performing it using available lexical resources. We add the frame information to an extract of the Swedish side of the Kotus and JRC-Acquis corpora using an automatic frame labeler and copy it to the Finnish side. We focus on evaluating the results to get an estimation on how often the parallel sentences can be said to express the same frame. This sheds light to the questions: Are the same situations in the two languages expressed using different frames, i.e. are the frames transferable even in theory? How well can the frame information of running text be transferred from language to another? },
  booktitle = {Proceedings of the Workshop on Inducing Linguistic Structure Submission (WILS)},
  author = {Borin, Lars and Forsberg, Markus and Johansson, Richard and Muhonen, Kristiina and Purtonen, Tanja and Voionmaa, Kaarlo},
  year = {2012},
  pages = {8--15},
}

@inproceedings{borin-etal-2012-search-157338,
  title = {Search Result Diversification Methods to Assist Lexicographers},
  abstract = {We show how the lexicographic task of finding informative and diverse example sentences can be cast as a search result diversification problem, where an objective based on relevance and diversity is maximized. This problem has been studied intensively in the information retrieval community during recent years, and efficient algorithms have been devised.
We finally show how the approach has been implemented in a lexicographic project, and describe the relevance and diversity functions used in that context. },
  booktitle = {Proceedings of the 6th Linguistic Annotation Workshop},
  author = {Borin, Lars and Forsberg, Markus and Friberg Heppin, Karin and Johansson, Richard and Kjellandsson, Annika},
  year = {2012},
  pages = {113--117},
}

@inproceedings{rama-borin-2011-estimating-140688,
  title = {Estimating Language Relationships from a Parallel Corpus. A Study of the Europarl Corpus},
  abstract = {Since the 1950s, linguists have been using short lists (40–200 items) of basic vocabulary as the central component in a methodology which is claimed to make it possible to automatically calculate genetic relationships among languages. In the last few years these methods have experienced something of a revival, in that more languages are involved, different distance measures are systematically compared and evaluated, and methods from computational biology are used for calculating language family trees. In this paper, we explore how this methodology can be extended in another direction, by using larger word lists automatically extracted from a parallel corpus using word alignment software.
We present preliminary results from using the Europarl parallel corpus in this way for estimating the distances between some languages in the Indo-European language family.},
  booktitle = {NEALT Proceedings Series (NODALIDA 2011 Conference Proceedings)},
  author = {Rama, Taraka and Borin, Lars},
  year = {2011},
  volume = {11},
  pages = {161--167},
}

% The former volume value "Accepted" was a stale publication-status marker,
% not a volume number; the entry already carries final page numbers.
@inproceedings{pedersen-etal-2012-linking-155599,
  title = {Linking and validating Nordic and Baltic wordnets},
  booktitle = {Proceedings of the 6th International Global Wordnet Conference},
  author = {Pedersen, Bolette Sandford and Borin, Lars and Forsberg, Markus and Lindén, Krister and Orav, Heili and Rögnvaldsson, Eiríkur},
  year = {2012},
  pages = {254--260},
}

@inproceedings{borin-etal-2011-semantic-140686,
  title = {Semantic Search in Literature as an e-Humanities Research Tool: CONPLISIT – Consumption Patterns and Life-Style in 19th Century Swedish Literature},
  abstract = {We present our ongoing work on language technology-based e-science in the humanities, with a focus on text-based research in the historical sciences. Currently, we are working on the adaptation and integration of lexical resources representing different historical stages of Swedish into a lexical and morphological toolbox that will allow us to develop semantically oriented text search applications for historical research on Swedish text.
We describe a semantic search prototype which was built using REST web services from this toolbox as components, and which has been evaluated by historians interested in using digitized 19th century novels as primary data for an historical investigation of the emerging consumer society in 19th century Sweden.},
  booktitle = {NEALT Proceedings Series (NODALIDA 2011 Conference Proceedings)},
  author = {Borin, Lars and Forsberg, Markus and Ahlberger, Christer},
  year = {2011},
  volume = {11},
  pages = {58--65},
}

@inproceedings{saxena-borin-2011-dialect-140689,
  title = {Dialect Classification in the Himalayas: a Computational Approach},
  abstract = {Linguistic fieldwork data – in the form of basic vocabulary lists – for nine closely related language varieties are compared using an automatic procedure with manual feedback, whose major advantage is its complete consistency. The results of the vocabulary comparison turn out to be in accord with other linguistic features, making this methodology a promising addition to the toolbox of genetic linguistics.},
  booktitle = {NEALT Proceedings Series (NODALIDA 2011 Conference Proceedings)},
  author = {Saxena, Anju and Borin, Lars},
  year = {2011},
  volume = {11},
  pages = {307--310},
}

@article{borin-forsberg-2011-swesaurus-151331,
  title = {Swesaurus – ett svenskt ordnät med fria tyglar},
  journal = {LexicoNordica},
  author = {Borin, Lars and Forsberg, Markus},
  year = {2011},
  volume = {18},
  pages = {17--39},
}

@inproceedings{skadina-etal-2011-meta-148648,
  title = {META-NORD: Towards sharing of language resources in Nordic and Baltic countries},
  abstract = {This paper introduces the META-NORD project which develops Nordic and Baltic part of the European open language resource infrastructure. META-NORD works on assembling, linking across languages, and making widely available the basic language resources used by developers, professionals and researchers to build specific products and applications.
The goals of the project, overall approach and specific action lines on wordnets, terminology resources and treebanks are described. Moreover, results achieved in first five months of the project, i.e. language whitepapers, metadata specification and IPR management, are presented.},
  booktitle = {Proceedings of the Workshop on Language Resources, Technology and Services in the Sharing Paradigm},
  author = {Skadina, Inguna and Vasiljevs, Andrejs and Borin, Lars and De Smedt, Koenraad and Lindén, Krister and Rögnvaldsson, Eiríkur},
  year = {2011},
  pages = {107--114},
}

@incollection{borin-forsberg-2011-diachronic-144291,
  title = {A diachronic computational lexical resource for 800 years of Swedish},
  booktitle = {Language technology for cultural heritage},
  author = {Borin, Lars and Forsberg, Markus},
  year = {2011},
  publisher = {Springer},
  address = {Berlin},
  isbn = {978-3-642-20226-1},
  pages = {41--61},
}

@techreport{borin-etal-2011-metadata-142495,
  title = {Metadata descriptions and other interoperability standards},
  abstract = {An important aim of META-NORD is to upgrade and harmonize national language resources and tools in order to make them interoperable, within languages and across languages, with respect to their data formats and as far as possible also as regards their content.
Since resources and to some extent tools will remain in one location – one of a number of META-NORD centers – the preferred way of accessing and utilizing resources and tools will be through metadata and APIs, allowing the assembly of on-the-fly tool-chains made up of standardized component language technology tools, processing distributed – and in many cases interlinked – language resources in standardized formats.},
  author = {Borin, Lars and Lindh, Jonas and Brandt, Martha and Olsson, Leif-Jöran},
  year = {2011},
}

@inproceedings{borin-etal-2010-past-110368,
  title = {The past meets the present in Swedish FrameNet++},
  abstract = {The paper is about a recently initiated project which aims at the development of a Swedish FrameNet as an integral part of a larger lexical resource, hence the name “Swedish FrameNet++” (SweFN++). It focuses on reuse of free electronic resources and their role in the acquisition and population of Swedish frames. After a brief overview of Swedish resources, we reflect on three approaches to recycling the available lexical data in a semi-automatic manner.
SweFN++ will be a multi-functional resource supporting research within lexicology and linguistics as well as different applications within computational lexicography and language technology, not to mention e-science.},
  booktitle = {14th EURALEX International Congress},
  author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios},
  year = {2010},
  pages = {269--281},
}

@article{hammarstrom-borin-2011-unsupervised-141707,
  title = {Unsupervised learning of morphology},
  journal = {Computational Linguistics},
  author = {Hammarström, Harald and Borin, Lars},
  year = {2011},
  volume = {37},
  number = {2},
  pages = {309--350},
}

% The first author's surname is spelled "Vasiljevs" in the other entries of
% this file; the author field is corrected, while the citation key keeps its
% original spelling so existing citations still resolve.
@inproceedings{vasljevs-etal-2011-meta-140690,
  title = {META-NORD: Baltic and Nordic Branch of the European Open Linguistic Infrastructure},
  booktitle = {Proceedings of the Nodalida 2011 Workshop on visibility and availability of LT resources},
  author = {Vasiljevs, Andrejs and Pedersen, Bolette Sandford and De Smedt, Koenraad and Borin, Lars and Skadina, Inguna},
  year = {2011},
}

@incollection{borin-2010-avtryck-136656,
  title = {Avtryck från WGLN-projekten i forskningen},
  booktitle = {Kunskapens nya världar},
  author = {Borin, Lars},
  year = {2010},
  publisher = {Uppsala universitet, Uppsala Learning Lab},
  address = {Uppsala},
  isbn = {978-91-506-2189-1},
  pages = {127--133},
}

@inproceedings{andreasson-etal-2009-swedish-102211,
  title = {Swedish CLARIN activities},
  booktitle = {Proceedings of the Nodalida 2009 workshop on CLARIN activities in the Nordic countries.
NEALT Proceedings Series},
  author = {Andréasson, Maia and Borin, Lars and Forsberg, Markus and Beskow, Jonas and Carlson, Rolf and Edlund, Jens and Elenius, Kjell and Hellmer, Kahl and House, David and Merkel, Magnus and Forsbom, Eva and Megyesi, Beáta and Eriksson, Anders and Strömqvist, Sven},
  year = {2009},
  volume = {5},
  pages = {1--5},
}

@article{borin-2010-zipf-130257,
  title = {Med Zipf mot framtiden - en integrerad lexikonresurs för svensk språkteknologi},
  journal = {LexicoNordica},
  author = {Borin, Lars},
  year = {2010},
  volume = {17},
  pages = {35--54},
}

% The venue is a conference, so this is a conference-paper entry with the
% venue in booktitle (it was previously typed as a journal article).
@inproceedings{borin-etal-2010-swedish-129126,
  title = {Swedish FrameNet++},
  booktitle = {Swedish Language Technology Conference 2010},
  author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios},
  year = {2010},
}

% NOTE(review): the "journal" below reads like a conference or volume title
% rather than a periodical; confirm the venue type before retyping the entry.
@article{borin-forsberg-2010-beyond-129125,
  title = {Beyond the synset: Swesaurus – a fuzzy Swedish wordnet},
  journal = {Re-thinking synonymy: semantic sameness and similarity in languages and their description},
  author = {Borin, Lars and Forsberg, Markus},
  year = {2010},
}

@inproceedings{borin-forsberg-2010-from-118908,
  title = {From the People’s Synonym Dictionary to fuzzy synsets - first steps},
  booktitle = {Proceedings of the LREC 2010 workshop Semantic relations.
Theory and Applications},
  author = {Borin, Lars and Forsberg, Markus},
  year = {2010},
  pages = {18--25},
}

@incollection{borin-kokkinakis-2010-literary-124517,
  title = {Literary onomastics and language technology},
  booktitle = {Literary education and digital learning},
  author = {Borin, Lars and Kokkinakis, Dimitrios},
  year = {2010},
  publisher = {Information Science Reference},
  address = {Hershey - New York},
  isbn = {978-1-60566-932-8},
  pages = {53--78},
}

@inproceedings{wittenburg-etal-2010-resource-118909,
  title = {Resource and service centres as the backbone for a sustainable service infrastructure},
  booktitle = {Proceedings of LREC 2010},
  author = {Wittenburg, Peter and Bel, Nuria and Borin, Lars and Budin, Gerhard and Calzolari, Nicoletta and Hajicova, Eva and Koskenniemi, Kimmo and Lemnitzer, Lothar and Maegaard, Bente and Piasecki, Maciej and Pierrel, Jean-Marie and Piperidis, Stelios and Skadina, Inguna and Tufis, Dan and van Veenendal, Remco and Váradi, Tamás and Wynne, Martin},
  year = {2010},
}

@inproceedings{borin-etal-2010-diabase-118907,
  title = {Diabase: Towards a diachronic BLARK in support of historical studies},
  booktitle = {Proceedings of LREC 2010},
  author = {Borin, Lars and Forsberg, Markus and Kokkinakis, Dimitrios},
  year = {2010},
}

@inproceedings{borin-olsson-2006-plattformen-116093,
  title = {ITG-plattformen som korpusverktyg},
  abstract = {En genomgång och handfast presentation om hur ITG-plattformen kan användas som korpusverktyg.},
  booktitle = {Fjärde svenska lingvistikkonferensen (Sling 2006), 27–28 april 2006, Stockholm},
  author = {Borin, Lars and Olsson, Leif-Jöran},
  year = {2006},
}

% The volume editors were previously written inside the booktitle, joined by
% a bare ampersand (which LaTeX rejects outside tables); they now live in
% the editor field.
@incollection{borin-prutz-2004-wine-33945,
  title = {New wine in old skins? A corpus investigation of L1 syntactic transfer in learner language},
  editor = {Aston, G. and Bernardini, S. and Stewart, D.},
  booktitle = {Corpora and language learners},
  author = {Borin, Lars and Prütz, Klas},
  year = {2004},
  publisher = {John Benjamins},
  address = {Amsterdam},
  isbn = {90-272-2288-6},
  pages = {67--87},
}

@inproceedings{borin-etal-2007-medical-44951,
  title = {Medical frames as target and tool},
  booktitle = {FRAME 2007: Building Frame Semantics resources for Scandinavian and Baltic languages. (Nodalida 2007 workshop proceedings)},
  author = {Borin, Lars and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios},
  year = {2007},
  isbn = {978-91-976939-0-5},
  pages = {11--18},
}

% Technical reports name their issuing body in the institution field; the
% bare underscore in the title (a LaTeX error in text mode) is replaced by a
% hyphen.
@techreport{ahlfelt-etal-2006-literature-34047,
  title = {Literature Review on Patient-Friendly Documentation Systems},
  author = {Åhlfelt, Hans and Borin, Lars and Daumke, Philipp and Grabar, Natalia and Hallett, Catalina and Hardcastle, David and Kokkinakis, Dimitrios and Mancini, Clara and Marko, Kornel and Merkel, Magnus and Pietsch, Christian and Power, Richard and Scott, Donia and Silvervarg, Annika and Toporowska Gronostaj, Maria and Williams, Sandra and Willis, Alistair},
  year = {2006},
  institution = {Göteborg University},
  address = {Göteborg},
}

@techreport{borin-etal-2007-empowering-53590,
  title = {Empowering the patient with language technology},
  author = {Borin, Lars and Grabar, Natalia and Hallett, Catalina and Hardcastle, David and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios and Williams, Sandra and Willis, Alistair},
  year = {2007},
  institution = {Göteborg University},
  address = {Göteborg},
}

@inproceedings{borin-etal-2009-thinking-110343,
  title = {Thinking Green: Toward Swedish FrameNet++},
  abstract = {Access to multi-layered lexical, grammatical and semantic information representing text content is a prerequisite for efficient automatic understanding and generation of natural language. A FrameNet is considered a valuable resource for both linguistics and language technology research that may contribute to the achievement of these goals.
Currently, FrameNet-like resources exist for a few languages,1 including some domain-specific and multilingual initiatives (Dolbey et al., 2006; Boas, 2009; Uematsu et al., 2009; Venturi et al., 2009), but are unavailable for most languages, including Swedish, although there have been some pilot studies exploring the semi-automatic acquisition of Swedish frames (Johansson & Nugues, 2006; Borin et al., 2007). At the University of Gothenburg, we are now embarking on a project to build a Swedish FrameNet-like resource. A novel feature of this project is that the Swedish FrameNetwill be an integral part of a largermany-faceted lexical resource. Hence the name Swedish FrameNet++ (SweFN++). }, booktitle = {FrameNet Masterclass and Workshop}, author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios}, year = {2009}, } @incollection{borin-2006-supporting-33863, title = {Supporting lesser-known languages: The promise of language technology}, booktitle = {Saxena, A. & Borin, L. (eds). Lesser-known languages of South Asia. Status and policies, case studies and applications of information technology}, author = {Borin, Lars}, year = {2006}, publisher = {Mouton de Gruyter}, address = {Berlin}, ISBN = {3-11-018976-3}, pages = {317--337}, } @inProceedings{marko-etal-2006-towards-40540, title = {Towards a multilingual medical lexicon}, booktitle = {Proceedings of the American Medical Informatics Association Symposium (AMIA '06)}, author = {Markó, Kornél and Baud, Robert and Zweigenbaum, Pierre and Borin, Lars and Merkel, Magnus and Schulz, Stefan}, year = {2006}, pages = {534--538}, } @incollection{borin-2004-language-33976, title = {Language technology resources for less prevalent languages: Will the Münchhausen Model work?}, booktitle = {Holmboe, H. (ed). Nordisk sprogteknologi 2003. Nordic language technology. 
Årbog for Nordisk Sprogteknologisk Forskningsprogram 2000-2004}, author = {Borin, Lars}, year = {2004}, publisher = {Museum Tusculanums Forlag}, address = {København}, ISBN = {87-7289-997-2}, pages = {71--82}, } @incollection{borin-saxena-2004-grammar-33944, title = {Grammar, incorporated}, booktitle = {Henrichsen, P. J. (ed). CALL for the Nordic languages}, author = {Borin, Lars and Saxena, Anju}, year = {2004}, publisher = {Samfundslitteratur}, address = {Frederiksberg}, ISBN = {87-593-1176-2}, pages = {125--145}, } @inProceedings{baud-etal-2005-interchanging-33867, title = {Interchanging lexical information for a multilingual dictionary}, booktitle = {AMIA 2005 Proceedings}, author = {Baud, Robert and Nyström, Mikael and Borin, Lars and Evans, Roger and Schulz, Stefan and Zweigenbaum, Pierre}, year = {2005}, pages = {31--35}, } @article{borin-2005-mannen-33865, title = {Mannen är faderns mormor: Svenskt associationslexikon reinkarnerat}, journal = {LexicoNordica}, author = {Borin, Lars}, year = {2005}, volume = {12}, pages = {39--54}, } @edited_book{saxena-borin-2006-lesser-33862, title = {Lesser-known languages of South Asia. Status and policies, case studies and applications of information technology}, editor = {Saxena, Anju and Borin, Lars}, year = {2006}, publisher = {Mouton de Gruyter}, address = {Berlin}, ISBN = {3-11-018976-3}, } @incollection{borin-2006-gar-33864, title = {Vi som går köksvägen: Språkteknologer och korpuslingvister i Litteraturbanken}, booktitle = {Börjesson, M. (red). Fältanteckningar: Utbildnings- och kultursociologiska texter tillägnade Donald Broady}, author = {Borin, Lars}, year = {2006}, publisher = {Forskningsgruppen för utbildnings- och kultursociologi (ILU), Uppsala universitet}, address = {Uppsala}, ISBN = {91-631-8807-4}, pages = {399--404}, } @incollection{borin-2006-sparv-44950, title = {Sparv i tranedansen eller fisken i vattnet? 
Språkteknologi och språklärande}, booktitle = {Från vision till praktik: Språkutbildning och informationsteknik}, author = {Borin, Lars}, year = {2006}, publisher = {NSHU - Myndigheten för nätverk och samarbete inom högre utbildning}, address = {Härnösand}, ISBN = {978-91-975425-8-6}, pages = {25--49}, } @inProceedings{borin-etal-2007-naming-44954, title = {Naming the past: Named entity and animacy recognition in 19th century Swedish literature}, booktitle = {ACL 2007 Workshop on Language Technology for Cultural Heritage Data (LaTeCH 2007)}, author = {Borin, Lars and Kokkinakis, Dimitrios and Olsson, Leif-Jöran}, year = {2007}, pages = {1--8}, } @inProceedings{borin-forsberg-2008-something-72502, title = {Something old, something new: A computational morphological description of Old Swedish}, booktitle = {LREC 2008 Workshop on Language Technology for Cultural Heritage Data (LaTeCH 2008)}, author = {Borin, Lars and Forsberg, Markus}, year = {2008}, pages = {9--16}, } @article{borin-2008-review-72506, title = {Review of Stig Johansson: Seeing through multilingual corpora: On the use of corpora in contrastive studies}, journal = {ICAME Journal}, author = {Borin, Lars}, year = {2008}, volume = {32}, pages = {261--267}, } @incollection{borin-2008-lemma-72507, title = {Lemma, lexem eller mittemellan? Ontologisk ångest i den digitala domänen}, booktitle = {Nog ordat? 
Festskrift till Sven-Göran Malmgren}, author = {Borin, Lars}, year = {2008}, publisher = {University of Gothenburg}, address = {Göteborg}, pages = {59--67}, } @edited_book{lendvai-borin-2009-proceedings-91853, title = {Proceedings of the EACL 2009 Workshop on Language Technology and Resources for Cultural Heritage, Social Sciences, Humanities, and Education (LaTeCH -- SHELT&R 2009)}, editor = {Lendvai, Piroska and Borin, Lars}, year = {2009}, publisher = {ACL}, address = {Athens}, ISBN = {1-932432-21-3}, } @inProceedings{borin-2009-linguistic-102209, title = {Linguistic diversity in the information society}, booktitle = {Proceedings of the SALTMIL 2009 workshop on Information Retrieval and Information Extraction for Less Resourced Languages}, author = {Borin, Lars}, year = {2009}, ISBN = {978-84-692-4940-6}, pages = {1--7}, } @inProceedings{borin-forsberg-2009-family-102212, title = {All in the family: A comparison of SALDO and WordNet}, booktitle = {Proceedings of the Nodalida 2009 Workshop on WordNets and other Lexical Semantic Resources - between Lexical Semantics, Lexicography, Terminology and Formal Ontologies. NEALT Proceedings Series}, author = {Borin, Lars and Forsberg, Markus}, year = {2009}, volume = {7}, } @techreport{borin-2009-bush-102214, title = {One in the bush: Low-density language technology}, author = {Borin, Lars}, year = {2009}, publisher = {University of Gothenburg}, address = {Göteborg}, } @techreport{andreasson-etal-2008-habeas-102220, title = {Habeas Corpus: A survey for SNK - a Swedish national corpus}, author = {Andréasson, Maia and Borin, Lars and Merkel, Magnus}, year = {2008}, publisher = {University of Gothenburg}, address = {Göteborg}, } @article{borin-etal-2008-saldo-110525, title = {SALDO 1.0 (Svenskt associationslexikon version 2)}, journal = {Språkbanken, Göteborg universitet}, author = {Borin, Lars and Forsberg, Markus and Lönngren, Lennart}, year = {2008}, }