@inProceedings{berdicevskis-etal-2023-superlim-331445, title = {Superlim: A Swedish Language Understanding Evaluation Benchmark}, booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, December 6-10, 2023, Singapore / Houda Bouamor, Juan Pino, Kalika Bali (Editors)}, author = {Berdicevskis, Aleksandrs and Bouma, Gerlof and Kurtz, Robin and Morger, Felix and Öhman, Joey and Adesam, Yvonne and Borin, Lars and Dannélls, Dana and Forsberg, Markus and Isbister, Tim and Lindahl, Anna and Malmsten, Martin and Rekathati, Faton and Sahlgren, Magnus and Volodina, Elena and Börjeson, Love and Hengchen, Simon and Tahmasebi, Nina}, year = {2023}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {979-8-89176-060-8}, pages = {8137--8153}, } @incollection{borin-etal-2023-language-337444, title = {Language Report Swedish}, abstract = {Swedish speech and language technology (LT) research goes back over 70 years. This has paid off: there is a national research infrastructure, as well as significant research projects, and Swedish is well-endowed with language resources (LRs) and tools. However, there are gaps that need to be filled, especially high-quality goldstandard LRs required by the most recent deep-learning methods. 
In the future, we would like to see closer collaborations and communication between the “traditional” LT research community and the burgeoning AI field, the establishment of dedicated academic LT training programmes, and national funding for LT research.}, booktitle = {Cognitive Technologies}, author = {Borin, Lars and Domeij, Rickard and Edlund, Jens and Forsberg, Markus}, year = {2023}, pages = {219--222}, } @misc{volodina-etal-2024-proceedings-335190, title = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden}, author = {Volodina, Elena and Bouma, Gerlof and Forsberg, Markus and Kokkinakis, Dimitrios and Alfter, David and Fridlund, Mats and Horn, Christian and Ahrenberg, Lars and Blåder, Anna}, year = {2024}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-8075-512-2}, } @inProceedings{bouma-etal-2024-konsten-333683, title = {Konsten att bedriva svensk ordforskning utan att kränka upphovsrätten}, abstract = {Vi beskriver KB-labb och Språkbanken Texts samarbete för att underlätta ordforskning på de upphovsrätts-skyddade korpusar som finns i Kungliga bibliotekets samlingar. Satsningen har hittils lett till två öppna datasamlingar, Kubord 1 och 2, som ger tillgång till ordstatistik och ordsamförekomststatistik. Vi beskriver även Kubord-fastText, en samling vektormodeller som är baserade på samma korpusar, som är underutveckling}, booktitle = {Proceedings of the Huminfra Conference (HiC 2024), Gothenburg, 10–11 January, 2024 / eds. 
Elena Volodina, Gerlof Bouma, Markus Forsberg, Dimitrios Kokkinakis, David Alfter, Mats Fridlund, Christian Horn, Lars Ahrenberg, Anna Blåder}, author = {Bouma, Gerlof and Forsberg, Markus and Sikora, Justyna and Sköldberg, Emma}, year = {2024}, publisher = { Linköping University Electronic Press}, address = {Linköping }, ISBN = {978-91-8075-512-2}, } @inProceedings{forsberg-hulden-2016-deriving-237061, title = {Deriving Morphological Analyzers from Example Inflections}, abstract = {This paper presents a semi-automatic method to derive morphological analyzers from a limited number of example inflections suitable for languages with alphabetic writing systems. The system we present learns the inflectional behavior of morphological paradigms from examples and converts the learned paradigms into a finite-state transducer that is able to map inflected forms of previously unseen words into lemmas and corresponding morphosyntactic descriptions. We evaluate the system when provided with inflection tables for several languages collected from the Wiktionary.}, booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC-2016) May 23-28, 2016, Portorož, Slovenia}, author = {Forsberg, Markus and Hulden, Mans}, year = {2016}, ISBN = {978-2-9517408-9-1}, } @inProceedings{virk-etal-2021-data-306964, title = {A Data-Driven Semi-Automatic Framenet Development Methodology }, abstract = {FrameNet is a lexical semantic resource based on the linguistic theory of frame semantics. A number of framenet development strategies have been reported previously and all of them involve exploration of corpora and a fair amount of manual work. Despite previous efforts, there does not exist a well-thought-out automatic/semi-automatic methodology for frame construction. In this paper we propose a data-driven methodology for identification and semi-automatic construction of frames. 
As a proof of concept, we report on our initial attempts to build a wider-scale framenet for the legal domain (LawFN) using the proposed methodology. The constructed frames are stored in a lexical database and together with the annotated example sentences they have been made available through a web interface.}, booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021 / Edited by Galia Angelova, Maria Kunilovskaya, Ruslan Mitkov, Ivelina Nikolova-Koleva}, author = {Virk, Shafqat and Dannélls, Dana and Borin, Lars and Forsberg, Markus}, year = {2021}, publisher = {INCOMA}, address = {Shoumen, Bulgaria}, ISBN = {978-954-452-072-4}, } @misc{forsberg-etal-2023-words-328244, title = {Words unboxed: discovering new words with Kubord}, author = {Forsberg, Markus and Sikora, Justyna and Sköldberg, Emma}, year = {2023}, publisher = {Kungliga biblioteket}, number = { 2023-08-29}, address = {Stockholm}, } @inProceedings{ahlberg-etal-2016-karp-246072, title = {Karp: Språkbanken’s Open Lexical Infrastructure}, booktitle = {Globalex 2016, May 24, Portorož, Slovenia}, author = {Ahlberg, Malin and Borin, Lars and Forsberg, Markus and Olsson, Olof and Schumacher, Anne and Uppström, Jonatan}, year = {2016}, } @techreport{hammarstedt-etal-2022-sparv-318399, title = {Sparv 5 Developer’s Guide}, abstract = {The Sparv Pipeline developed by Språkbanken Text is a text analysis tool run from the command line. This Developer’s Guide describes its general structure and key concepts and serves as an API documentation. 
Most importantly, it describes how to write plugins for Sparv 5 so that you can add your own functions to the toolkit.}, author = {Hammarstedt, Martin and Schumacher, Anne and Borin, Lars and Forsberg, Markus}, year = {2022}, } @inProceedings{ahlberg-etal-2013-korp-178355, title = {Korp and Karp – a bestiary of language resources: the research infrastructure of Språkbanken}, abstract = {A central activity in Språkbanken, an R&D unit at the University of Gothenburg, is the systematic construction of a research infrastructure based on interoperability and widely accepted standards for metadata and data. The two main components of this infrastructure deal with text corpora and with lexical resources. For modularity and flexibility, both components have a backend, or server-side part, accessed through an API made up of a set of well-defined web services. This means that there can be any number of different user interfaces to these components, corresponding, e.g., to different research needs. Here, we will demonstrate the standard corpus and lexicon search interfaces, designed primarily for linguistic searches: Korp and Karp.}, booktitle = {Proceedings of the 19th Nordic Conference of Computational Linguistics (NODALIDA 2013), May 22–24, 2013, Oslo University, Norway. NEALT Proceedings Series 16}, author = {Ahlberg, Malin and Borin, Lars and Forsberg, Markus and Hammarstedt, Martin and Olsson, Leif-Jöran and Olsson, Olof and Roxendal, Johan and Uppström, Jonatan}, year = {2013}, publisher = {Linköping University Electronic Press}, address = {Linköping}, } @inProceedings{borin-etal-2013-lexical-186032, title = {The lexical editing system of Karp}, abstract = {Karp is the open lexical infrastructure of Språkbanken (the Swedish Language Bank). 
The infrastructure has three main functions: (1) to support the work on creating, curating, and integrating our various lexical resources; (2) to publish the resources, making them searchable and downloadable; and (3) to offer advanced editing functionalities. An important feature of the lexical infrastructure is also that we maintain a strong bidirectional connection to our corpus infrastructure. At the heart of the infrastructure is the SweFN++ project with the goal to create free Swedish lexical resources geared towards language technology applications. The infrastructure currently hosts 23 Swedish lexical resources. The resources are integrated through links to a pivot lexical resource, SALDO, a large morphological and lexical-semantic resource for modern Swedish.}, booktitle = {Kosem, I., Kallas, J., Gantar, P., Krek, S., Langemets, M., Tuulik, M. (eds.) 2013. Electronic lexicography in the 21st century: thinking outside the paper. Proceedings of the eLex 2013 conference, 17-19 October 2013, Tallinn, Estonia.}, author = {Borin, Lars and Forsberg, Markus and Olsson, Leif-Jöran and Olsson, Olof and Uppström, Jonatan}, year = {2013}, publisher = {Trojina, Institute for Applied Slovene Studies / Eesti Keele Instituut }, address = {Ljubljana/Tallinn}, ISBN = { 978-961-93594-0-2}, } @techreport{hammarstedt-etal-2022-sparv-318405, title = {Sparv 5 User Manual}, abstract = {The Sparv Pipeline developed by Språkbanken Text is a text analysis tool run from the command line. 
This user manual describes how to get Sparv 5 up and running on your own machine, how to configure it and how to use it for annotating your own corpora.}, author = {Hammarstedt, Martin and Schumacher, Anne and Borin, Lars and Forsberg, Markus}, year = {2022}, publisher = {Institutionen för svenska, flerspråkighet och språkteknologi}, address = {Göteborg}, } @inProceedings{ahlberg-etal-2016-sprakbanken's-246063, title = {Språkbanken’s Open Lexical Infrastructure}, abstract = {Karp is an open lexical infrastructure and a web based tool for searching, exploring and developing lexical resources. Språkbanken currently hosts a number of lexicons in Karp and on-going work aims at broadening the type of resources that can be developed in the system. This abstract gives a short overview of Karp's basic functionality, and describes some current projects and on-going work.}, booktitle = {SLTC 2016. The Sixth Swedish Language Technology Conference. Umeå University, 17-18 November, 2016}, author = {Ahlberg, Malin and Borin, Lars and Forsberg, Markus and Olsson, Olof and Schumacher, Anne and Uppström, Jonatan}, year = {2016}, } @incollection{borin-etal-2021-swedish-311385, title = {Swedish FrameNet++ – lexical samsara}, booktitle = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications / editor(s): Dana Dannélls, Lars Borin and Karin Friberg Heppin}, author = {Borin, Lars and Forsberg, Markus and Lönngren, Lennart and Zechner, Niklas}, year = {2021}, publisher = {John Benjamins}, address = {Amsterdam}, ISBN = {9789027209900}, pages = {69–95}, } @edited_book{volodina-etal-2022-live-320415, title = {Live and Learn- Festschrift in honor of Lars Borin}, abstract = {This Festschrift has been compiled to honor Professor Lars Borin on his 65th anniversary. It consists of 30 articles which reflect a fraction of Lars’ scholarly interests within computational linguistics and related fields. 
They come from his friends and colleagues around the world and deal with topics that have been – in one way or another – inspired by his work. A common theme for the articles is the never-ending need to learn, which is alluded to in the title of the volume, Live and Learn.}, editor = {Volodina, Elena and Dannélls, Dana and Berdicevskis, Aleksandrs and Forsberg, Markus and Virk, Shafqat}, year = {2022}, publisher = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet}, address = {Göteborg}, ISBN = {978-91-87850-83-7}, } @incollection{forsberg-skoldberg-2022-ordvektorer-320472, title = {Ordvektorer i lexikografiskt arbete}, abstract = {We present a preliminary case study on the use of word vectors in lexicographic practice. The study shows the potential of using vector models in the revision of existing dictionary entries as well as creating new entries.}, booktitle = {Live and learn. Festschrift in honor of Lars Borin (eds. Elena Volodina, Dana Dannélls, Aleksandrs Berdicevskis, Markus Forsberg & Shafqat Virk)}, author = {Forsberg, Markus and Sköldberg, Emma}, year = {2022}, publisher = {Department of Swedish, Multilingualism, Language Technology}, address = {Gothenburg}, ISBN = {978-91-87850-82-0}, pages = {37--41}, } @incollection{linden-etal-2021-multilingual-311386, title = {A multilingual net of lexical resources}, booktitle = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications / editor(s): Dana Dannélls, Lars Borin and Karin Friberg Heppin}, author = {Lindén, Krister and Niemi, Jyrki and Borin, Lars and Forsberg, Markus and Pedersen, Bolette S. 
and Nimb, Sanni and Orav, Heili and Kahusk, Neeme and Vider, Kadri}, year = {2021}, publisher = {John Benjamins}, address = {Amsterdam}, ISBN = {9789027209900}, pages = {123–137}, } @incollection{dannells-etal-2021-swedish-310041, title = {Swedish FrameNet}, abstract = {This chapter describes the development of Swedish FrameNet. A new framenet project often follows one of two methodological approaches: (1) extension, through translation of a different-language – often English – framenet into the target language, and (2) merging, where the resource is built from scratch in the target language. Both approaches have their pros and cons, which have been extensively discussed in the literature. Swedish FrameNet is mainly developed through the extension approach, although balanced with the merging approach. Drawing on the two approaches simultaneously, we describe how integrated language resources and tools have been exploited to create and develop Swedish FrameNet: how it was constructed, what it contains, and the basic assumptions underlying the annotation of its contents. }, booktitle = {The Swedish FrameNet++. 
Harmonization, integration, method development and practical language technology applications}, author = {Dannélls, Dana and Borin, Lars and Forsberg, Markus and Friberg Heppin, Karin and Toporowska Gronostaj, Maria}, year = {2021}, publisher = {John Benjamins Publishing Company}, address = {Amsterdam / Philadelphia}, ISBN = {978 90 272 5848 9}, pages = {37 -- 66}, } @inProceedings{virk-etal-2020-dream-295338, title = {The DReaM Corpus: A Multilingual Annotated Corpus of Grammars for the World’s Languages}, booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020), Marseille, 11–16 May 2020 / Editors : Nicoletta Calzolari, Frédéric Béchet, Philippe Blache, Khalid Choukri, Christopher Cieri, Thierry Declerck, Sara Goggi, Hitoshi Isahara, Bente Maegaard, Joseph Mariani, Hélène Mazo, Asuncion Moreno, Jan Odijk, Stelios Piperidis}, author = {Virk, Shafqat and Hammarström, Harald and Forsberg, Markus and Wichmann, Søren }, year = {2020}, publisher = {European Language Resources Association}, address = {Paris}, ISBN = {979-10-95546-34-4 }, } @inProceedings{virk-etal-2020-from-295339, title = {From Linguistic Descriptions to Language Profiles}, abstract = {Language catalogues and typological databases are two important types of resources containing different types of knowledge about the world’s natural languages. The former provide metadata such as number of speakers, location (in prose descriptions and/or GPS coordinates), language code, literacy, etc., while the latter contain information about a set of structural and functional attributes of languages. Given that both types of resources are developed and later maintained manually, there are practical limits as to the number of languages and the number of features that can be surveyed. 
We introduce the concept of a language profile, which is intended to be a structured representation of various types of knowledge about a natural language extracted semi-automatically from descriptive documents and stored at a central location. It has three major parts: (1) an introductory; (2) an attributive; and (3) a reference part, each containing different types of knowledge about a given natural language. As a case study, we develop and present a language profile of an example language. At this stage, a language profile is an independent entity, but in the future it is envisioned to become part of a network of language profiles connected to each other via various types of relations. Such a representation is expected to be suitable both for humans and machines to read and process for further deeper linguistic analyses and/or comparisons.}, booktitle = {Proceedings of the 7th Workshop on Linked Data in Linguistics (LDL-2020). Language Resources and Evaluation Conference (LREC 2020), Marseille, 11–16 May 2020 / Edited by : Maxim Ionov, John P. McCrae, Christian Chiarcos, Thierry Declerck, Julia Bosque-Gil, and Jorge Gracia}, author = {Virk, Shafqat and Hammarström, Harald and Borin, Lars and Forsberg, Markus and Wichmann, Søren }, year = {2020}, publisher = {European Language Resources Association}, address = {Paris}, ISBN = {979-10-95546-36-8}, } @article{sandberg-etal-2019-issue-285614, title = {Issue Salience on Twitter During Swedish Party Leaders’ Debates }, abstract = {The objective of this study is to contribute knowledge about formation of political agendas on Twitter during mediated political events, using the party leaders’ debates in Sweden before the general election of 2014 as a case study. Our findings show that issues brought up during the debates were largely mirrored on Twitter, with one striking discrepancy. 
Contrary to our expectations, issues on the left-right policy dimension were more salient on Twitter than in the debates, whereas issues such as the environment, immigration and refugees, all tied to a liberal-authoritarian value axis, were less salient on Twitter.}, journal = {Nordicom Review}, author = {Sandberg, Linn and Bjereld, Ulf and Bunyik, Karina and Forsberg, Markus and Johansson, Richard}, year = {2019}, volume = {40}, number = {2}, pages = {49--61}, } @article{smith-etal-2014-readability-188146, title = {Readability, suitability and comprehensibility in patient education materials for Swedish patients with colorectal cancer undergoing elective surgery: A mixed method design.}, abstract = {To characterize education materials provided to patients undergoing colorectal cancer surgery to gain a better understanding of how to design readable, suitable, comprehensible materials.}, journal = {Patient education and counseling}, author = {Smith, Frida and Carlsson, Eva and Kokkinakis, Dimitrios and Forsberg, Markus and Kodeda, Karl and Sawatzky, Richard and Friberg, Febe and Öhlén, Joakim}, year = {2014}, volume = {94}, number = {2}, pages = {202–209}, } @inProceedings{adesam-etal-2018-eukalyptus-273839, title = {The Eukalyptus Treebank of Written Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard and Borin, Lars and Forsberg, Markus}, year = {2018}, } @inProceedings{hammarstrom-etal-2017-poor-261851, title = {Poor man's OCR post-correction: Unsupervised recognition of variant spelling applied to a multilingual document collection}, abstract = {© 2017 Copyright held by the owner/author(s). The accuracy of Optical Character Recognition (OCR) is sets the limit for the success of subsequent applications used in text analyzing pipeline. 
Recent models of OCR postprocessing significantly improve the quality of OCR-generated text but require engineering work or resources such as humanlabeled data or a dictionary to perform with such accuracy on novel datasets. In the present paper we introduce a technique for OCR post-processing that runs off-the-shelf with no resources or parameter tuning required. In essence, words which are similar in form that are also distributionally more similar than expected at random are deemed OCR-variants. As such it can be applied to any language or genre (as long as the orthography segments the language at the word-level). The algorithm is illustrated and evaluated using a multilingual document collection and a benchmark English dataset.}, booktitle = {DATeCH2017, Proceedings of the 2nd International Conference on Digital Access to Textual Cultural Heritage, Göttingen, Germany — June 01 - 02, 2017 }, author = {Hammarström, Harald and Virk, Shafqat and Forsberg, Markus}, year = {2017}, publisher = {Association for Computing Machinery (ACM)}, address = {New York}, ISBN = {978-1-4503-5265-9}, } @techreport{hammarstedt-etal-2017-korp-256055, title = {Korp 6 - Technical Report}, author = {Hammarstedt, Martin and Roxendal, Johan and Öhrman, Maria and Borin, Lars and Forsberg, Markus and Schumacher, Anne}, year = {2017}, publisher = {Institutionen för svenska språket, Göteborgs universitet}, } @techreport{hammarstedt-etal-2017-korp-256056, title = {Korp 6 - Användarmanual}, author = {Hammarstedt, Martin and Borin, Lars and Forsberg, Markus and Roxendal, Johan and Schumacher, Anne and Öhrman, Maria}, year = {2017}, publisher = {Institutionen för svenska språket, Göteborgs universitet}, } @inProceedings{nord-forsberg-2017-enklare-259902, title = {Enklare efter klarspråk? 
Myndighetstexter före och efter ett klarspråksprojekt}, booktitle = {Saga Bendegard, Ulla Melander Marttala & Maria Westman (red.), Språk och norm: Rapport från ASLA:s symposium, Uppsala universitet 21–22 april 2016}, author = {Nord, Andreas and Forsberg, Markus}, year = {2017}, publisher = {ASLA}, address = {Uppsala}, ISBN = {978-91-87884-26-9}, } @article{smith-etal-2012-studie-170897, title = {Ny studie visar hur information till patienter med kolorektal cancer kan förbättras}, abstract = {Skriftligt informationsmaterial är ofta skrivet på för hög nivå och ställer höga krav på den tänkta läsaren (patienten). Förutom läsbarhet finns det fler faktorer att utvärdera för att se om materialet är lämpligt. Innehåll, struktur, layout och typsnitt, illustrationer och lärande och motivation är sådant som bör tas hänsyn till. Ett lämpligare, bättre anpassat material kan hjälpa personer med sjukdom att ställa bättre frågor när de har samtal med vårdpersonal och det kan göra personen mindre osäker och orolig för det okända som väntar. En ny studie som ingår i forskningsprojektet PINCORE (personcentred information and communication in colorectal cancer care) syftar till att förbättra information och kommunikation vid kolorektal cancer.}, journal = {Cancervården}, author = {Smith, Frida and Öhlén, Joakim and Carlsson, Eva and Friberg, Febe and Forsberg, Markus and Kokkinakis, Dimitrios}, year = {2012}, number = {5}, pages = {18--21}, } @inProceedings{ahlberg-etal-2015-paradigm-217987, title = {Paradigm classification in supervised learning of morphology}, abstract = {Supervised morphological paradigm learning by identifying and aligning the longest common subsequence found in inflection tables has recently been proposed as a simple yet competitive way to induce morphological patterns. We combine this non-probabilistic strategy of inflection table generalization with a discriminative classifier to permit the reconstruction of complete inflection tables of unseen words. 
Our system learns morphological paradigms from labeled examples of inflection patterns (inflection tables) and then produces inflection tables from unseen lemmas or base forms. We evaluate the approach on datasets covering 11 different languages and show that this approach results in consistently higher accuracies vis-a-vis other methods on the same task, thus indicating that the general method is a viable approach to quickly creating high-accuracy morphological resources.}, booktitle = {Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, author = {Ahlberg, Malin and Forsberg, Markus and Huldén, Måns}, year = {2015}, } @inProceedings{ahlberg-etal-2015-case-217988, title = {A case study on supervised classification of Swedish pseudo-coordination}, abstract = {We present a case study on supervised classification of Swedish pseudo-coordination (SPC). The classification is attempted on the type-level with data collected from two data sets: a blog corpus and a fiction corpus. Two small experiments were designed to evaluate the feasability of this task. The first experiment explored a classifier’s ability to discriminate pseudo-coordinations from ordinary verb coordinations, given a small labeled data set created during the experiment. 
The second experiment evaluated how well the classifier performed at detecting and ranking SPCs in a set of unlabeled verb coordinations, to investigate if it could be used as a semi-automatic discovery procedure to find new SPCs.}, booktitle = {Proceedings of the 20th Nordic Conference of Computational Linguistics, NODALIDA 2015, May 11-13, 2015, Vilnius, Lithuania}, author = {Ahlberg, Malin and Andersson, Peter and Forsberg, Markus and Tahmasebi, Nina}, year = {2015}, publisher = {Linköping University Electronic Press}, address = {Linköpings universitet}, ISBN = {978-91-7519-098-3}, } @inProceedings{borin-etal-2016-sparv-246053, title = {Sparv: Språkbanken’s corpus annotation pipeline infrastructure}, abstract = {Sparv is Språkbanken's corpus annotation pipeline infrastructure. The easiest way to use the pipeline is from its web interface with a plain text document. The pipeline uses in-house and external tools on the text to segment it into sentences and paragraphs, tokenise, tag parts-of-speech, look up in dictionaries and analyse compounds. The pipeline can also be run using a web API with XML results, and it is run locally at Språkbanken to prepare the documents in Korp, our corpus search tool. While the most sophisticated support is for modern Swedish, the pipeline supports 15 languages.}, booktitle = {SLTC 2016. The Sixth Swedish Language Technology Conference, Umeå University, 17-18 November, 2016}, author = {Borin, Lars and Forsberg, Markus and Hammarstedt, Martin and Rosén, Dan and Schäfer, Roland and Schumacher, Anne}, year = {2016}, } @inProceedings{cap-etal-2016-sword-254388, title = {SWORD: Towards Cutting-Edge Swedish Word Processing}, abstract = {Despite many years of research on Swedish language technology, there is still no well-documented standard for Swedish word processing covering the whole spectrum from low-level tokenization to morphological analysis and disambiguation. 
SWORD is a new initiative within the SWE-CLARIN consortium aiming to develop documented standards for Swedish word processing. In this paper, we report on a pilot study of Swedish tokenization, where we compare the output of six different tokenizers on four different text types. For one text type (Wikipedia articles), we also compare to the tokenization produced by six manual annotators.}, booktitle = {Proceedings of the Sixth Swedish Language Technology Conference (SLTC) Umeå University, 17-18 November, 2016}, author = {Cap, Fabienne and Adesam, Yvonne and Ahrenberg, Lars and Borin, Lars and Bouma, Gerlof and Forsberg, Markus and Kann, Viggo and Östling, Robert and Smith, Aaron and Wirén, Mats and Nivre, Joakim}, year = {2016}, } @inProceedings{kokkinakis-etal-2012-literacy-164587, title = {Literacy Demands and Information to Cancer Patients}, abstract = {This study examines language complexity of written health information materials for patients undergoing colorectal cancer surgery. Written and printed patient information from 28 Swedish clinics are automatically analyzed by means of language technology. The analysis reveals different problematic issues that might have impact on readability. The study is a first step, and part of a larger project about patients’ health information seeking behavior in relation to written information material. Our study aims to provide support for producing more individualized, person centered information materials according to preferences for complex and detailed or legible texts and thus enhance a movement from receiving information and instructions to participating in knowing. 
In the near future the study will continue by integrating focus groups with patients that may provide valuable feedback and enhance our knowledge about patients’ use and preferences of different information material.}, booktitle = {Proceedings of the 15th International Conference on Text, Speech and Dialogue}, author = {Kokkinakis, Dimitrios and Forsberg, Markus and Johansson Kokkinakis, Sofie and Smith, Frida and Öhlén, Joakim}, year = {2012}, ISBN = {978-364232789-6}, } @article{tahmasebi-etal-2015-visions-212969, title = {Visions and open challenges for a knowledge-based culturomics}, abstract = {The concept of culturomics was born out of the availability of massive amounts of textual data and the interest to make sense of cultural and language phenomena over time. Thus far however, culturomics has only made use of, and shown the great potential of, statistical methods. In this paper, we present a vision for a knowledge-based culturomics that complements traditional culturomics. We discuss the possibilities and challenges of combining knowledge-based methods with statistical methods and address major challenges that arise due to the nature of the data; diversity of sources, changes in language over time as well as temporal dynamics of information in general. 
We address all layers needed for knowledge-based culturomics, from natural language processing and relations to summaries and opinions.}, journal = {International Journal on Digital Libraries}, author = {Tahmasebi, Nina and Borin, Lars and Capannini, Gabriele and Dubhashi, Devdatt and Exner, Peter and Forsberg, Markus and Gossen, Gerhard and Johansson, Fredrik and Johansson, Richard and Kågebäck, Mikael and Mogren, Olof and Nugues, Pierre and Risse, Thomas}, year = {2015}, volume = {15}, number = {2-4}, pages = {169--187}, } @incollection{forsberg-hulden-2016-learning-240208, title = {Learning Transducer Models for Morphological Analysis from Example Inflections}, abstract = {In this paper, we present a method to convert morphological inflection tables into unweighted and weighted finite transducers that perform parsing and generation. These transducers model the inflectional behavior of morphological paradigms induced from examples and can map inflected forms of previously unseen word forms into their lemmas and give morphosyntactic descriptions of them. The system is evaluated on several languages with data collected from the Wiktionary.}, booktitle = {Proceedings of the SIGFSM Workshop on Statistical NLP and Weighted Automata. Association for Computational Linguistics. August 12, 2016 Berlin, Germany}, author = {Forsberg, Markus and Hulden, Mans}, year = {2016}, publisher = {ACL}, address = {Stroudsburg, PA, USA}, ISBN = {978-1-945626-13-5 }, pages = {42--50}, } @inProceedings{borin-etal-2013-mining-188846, title = {Mining semantics for culturomics: towards a knowledge-based approach}, abstract = {The massive amounts of text data made available through the Google Books digitization project have inspired a new field of big-data textual research. Named culturomics, this field has attracted the attention of a growing number of scholars over recent years. 
However, initial studies based on these data have been criticized for not referring to relevant work in linguistics and language technology. This paper provides some ideas, thoughts and first steps towards a new culturomics initiative, based this time on Swedish data, which pursues a more knowledge-based approach than previous work in this emerging field. The amount of new Swedish text produced daily and older texts being digitized in cultural heritage projects grows at an accelerating rate. These volumes of text being available in digital form have grown far beyond the capacity of human readers, leaving automated semantic processing of the texts as the only realistic option for accessing and using the information contained in them. The aim of our recently initiated research program is to advance the state of the art in language technology resources and methods for semantic processing of Big Swedish text and focus on the theoretical and methodological advancement of the state of the art in extracting and correlating information from large volumes of Swedish text using a combination of knowledge-based and statistical methods.}, booktitle = {2013 ACM International Workshop on Mining Unstructured Big Data Using Natural Language Processing, UnstructureNLP 2013, Held at 22nd ACM International Conference on Information and Knowledge Management, CIKM 2013; San Francisco, CA; United States; 28 October 2013 through 28 October 2013}, author = {Borin, Lars and Dubhashi, Devdatt and Forsberg, Markus and Johansson, Richard and Kokkinakis, Dimitrios and Nugues, Pierre}, year = {2013}, ISBN = {978-1-4503-2415-1}, pages = {3--10}, } @article{adesam-etal-2016-sprakteknologi-237884, title = {Språkteknologi för svenska språket genom tiderna}, abstract = {Språkbanken, the Swedish Language Bank, is a language technology research unit at the Department of Swedish, University of Gothenburg. 
We develop language resources – such as corpora, lexical resources, and analytical tools – for all variants of Swedish, from Old Swedish laws to present-day social media. Historical texts offer exciting theoretical and methodological challenges for language technology because they often defy the assumption inherent in most automatic analysis tools that the texts contain a standardized written language. In this article, we describe our ongoing work on the development of annotated historical corpora, as well as our efforts on linking various resources (both corpora and lexical resources). This research advances the state of the art of language technology as well as enables new research for scholars in other disciplines.},
  journal = {Kungliga Skytteanska Samfundets Handlingar},
  author = {Adesam, Yvonne and Ahlberg, Malin and Andersson, Peter and Borin, Lars and Bouma, Gerlof and Forsberg, Markus},
  year = {2016},
  volume = {76},
  number = {Studier i svensk språkhistoria 13},
  pages = {65--87},
}

@article{forsberg-etal-2014-from-208123,
  title = {From construction candidates to constructicon entries: An experiment using semi-automatic methods for identifying constructions in corpora},
  abstract = {We present an experiment where natural language processing tools are used to automatically identify potential constructions in a corpus. The experiment was conducted as part of the ongoing efforts to develop a Swedish constructicon. Using an automatic method to suggest constructions has advantages not only for efficiency but also methodologically: it forces the analyst to look more objectively at the constructions actually occurring in corpora, as opposed to focusing on “interesting” constructions only. 
As a heuristic for identifying potential constructions, the method has proved successful, yielding about 200 (out of 1,200) highly relevant construction candidates.},
  journal = {Constructions and Frames},
  author = {Forsberg, Markus and Johansson, Richard and Bäckström, Linnéa and Borin, Lars and Lyngfelt, Benjamin and Olofsson, Joel and Prentice, Julia},
  year = {2014},
  volume = {6},
  number = {1},
  pages = {114--135},
}

@misc{andersen-etal-2015-sibirientyska-215757,
  title = {Sibirientyska kvinnor (Siberian German women)},
  abstract = {Siberian German women The corpus consists of dialogs between four women born in 1927 to 1937 in the Soviet Volga Republic. Their mother tongue is a German variety spoken in Russia since the second half of the 18th century. Since the end of the Second World War, the women have lived in the region of Krasnoyarsk. They talk about their backgrounds and their everyday lives in the village. The corpus consists of about 16 000 words. Russian words and hybrids are given in [brackets], the turns of the interviewers are in {brackets}; all verb forms have got the attribute FINIT or INFINIT. More information on the research project see Syntax in contact. 
},
  author = {Andersen, Christiane and Forsberg, Markus and Hammarstedt, Martin and Pankow, Alexander},
  year = {2015},
  publisher = {University of Gothenburg},
  address = {Göteborg},
}

@inProceedings{ahlberg-etal-2014-swedish-210083,
  title = {Swedish FrameNet++ The Beginning of the End and the End of the Beginning},
  booktitle = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014},
  author = {Ahlberg, Malin and Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Friberg Heppin, Karin and Johansson, Richard and Kokkinakis, Dimitrios and Olsson, Leif-Jöran and Uppström, Jonatan},
  year = {2014},
}

@inProceedings{adesam-etal-2014-koala-211376,
  title = {Koala – Korp’s Linguistic Annotations Developing an infrastructure for text-based research with high-quality annotations},
  booktitle = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014},
  author = {Adesam, Yvonne and Borin, Lars and Bouma, Gerlof and Forsberg, Markus and Johansson, Richard},
  year = {2014},
}

@inProceedings{lyngfelt-etal-2014-svenskt-208457,
  title = {Ett svenskt konstruktikon. Grammatik möter lexikon},
  booktitle = {Svenskans beskrivning : Förhandlingar vid Trettiotredje sammankomsten för svenskans beskrivning. Helsingfors den 15–17 maj 2013},
  author = {Lyngfelt, Benjamin and Borin, Lars and Bäckström, Linnéa and Forsberg, Markus and Olsson, Leif-Jöran and Prentice, Julia and Rydstedt, Rudolf and Sköldberg, Emma and Tingsell, Sofia and Uppström, Jonatan},
  year = {2014},
  volume = {33},
  ISBN = {978-951-51-0120-4},
  pages = {268--279},
}

@inProceedings{borin-etal-2014-representing-204731,
  title = {Representing Swedish Lexical Resources in RDF with lemon},
  abstract = {The paper presents an ongoing project which aims to publish Swedish lexical-semantic resources using Semantic Web and Linked Data technologies. 
In this article, we highlight the practical conversion methods and challenges of converting three of the Swedish language resources in RDF with lemon.},
  booktitle = {Proceedings of the ISWC 2014 Posters \& Demonstrations Track a track within the 13th International Semantic Web Conference (ISWC 2014)},
  author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and McCrae, John P.},
  year = {2014},
  volume = {1272},
  pages = {329--332},
}

@inProceedings{adesam-etal-2014-computer-198794,
  title = {Computer-aided Morphology Expansion for Old Swedish},
  abstract = {In this paper we describe and evaluate a tool for paradigm induction and lexicon extraction that has been applied to Old Swedish. The tool is semi-supervised and uses a small seed lexicon and unannotated corpora to derive full inflection tables for input lemmata. In the work presented here, the tool has been modified to deal with the rich spelling variation found in Old Swedish texts. We also present some initial experiments, which are the first steps towards creating a large-scale morphology for Old Swedish.},
  booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14) May 26-31, 2014 Reykjavik, Iceland},
  author = {Adesam, Yvonne and Ahlberg, Malin and Andersson, Peter and Bouma, Gerlof and Forsberg, Markus and Hulden, Mans},
  year = {2014},
  ISBN = {978-2-9517408-8-4},
  pages = {1102--1105},
}

@inProceedings{ahlberg-etal-2014-semi-198791,
  title = {Semi-supervised learning of morphological paradigms and lexicons},
  abstract = {We present a semi-supervised approach to the problem of paradigm induction from inflection tables. Our system extracts generalizations from inflection tables, representing the resulting paradigms in an abstract form. The process is intended to be language-independent, and to provide human-readable generalizations of paradigms. The tools we provide can be used by linguists for the rapid creation of lexical resources. 
We evaluate the system through an inflection table reconstruction task using Wiktionary data for German, Spanish, and Finnish. With no additional corpus information available, the evaluation yields per word form accuracy scores on inflecting unseen base forms in different languages ranging from 87.81\% (German nouns) to 99.52\% (Spanish verbs); with additional unlabeled text corpora available for training the scores range from 91.81\% (German nouns) to 99.58\% (Spanish verbs). We separately evaluate the system in a simulated task of Swedish lexicon creation, and show that on the basis of a small number of inflection tables, the system can accurately collect from a list of noun forms a lexicon with inflection information ranging from 100.0\% correct (collect 100 words), to 96.4\% correct (collect 1000 words).},
  booktitle = {Proceedings of the 14th Conference of the European Chapter of the Association for Computational Linguistics, Gothenburg, Sweden 26–30 April 2014},
  author = {Ahlberg, Malin and Forsberg, Markus and Hulden, Mans},
  year = {2014},
  ISBN = {978-1-937284-78-7},
  pages = {569--578},
}

@article{borin-etal-2013-saldo-188604,
  title = {SALDO: a touch of yin to WordNet's yang},
  abstract = {The English-language Princeton WordNet (PWN) and some wordnets for other languages have been extensively used as lexical–semantic knowledge sources in language technology applications, due to their free availability and their size. The ubiquitousness of PWN-type wordnets tends to overshadow the fact that they represent one out of many possible choices for structuring a lexical-semantic resource, and it could be enlightening to look at a differently structured resource both from the point of view of theoretical–methodological considerations and from the point of view of practical text processing requirements. 
The resource described here—SALDO—is such a lexical–semantic resource, intended primarily for use in language technology applications, and offering an alternative organization to PWN-style wordnets. We present our work on SALDO, compare it with PWN, and discuss some implications of the differences. We also describe an integrated infrastructure for computational lexical resources where SALDO forms the central component.},
  journal = {Language resources and evaluation},
  author = {Borin, Lars and Forsberg, Markus and Lönngren, Lennart},
  year = {2013},
  volume = {47},
  number = {4},
  pages = {1191--1211},
}

@inProceedings{borin-forsberg-2014-swesaurus;-193085,
  title = {Swesaurus; or, The Frankenstein Approach to Wordnet Construction},
  abstract = {Swesaurus is a freely available (under a CC-BY license) Swedish wordnet under construction, built primarily by scavenging and recycling information from a number of existing lexical resources. Among its more unusual characteristics are graded lexical-semantic relations and inclusion of all parts of speech, not only open-class items.},
  booktitle = {Proceedings of the Seventh Global WordNet Conference (GWC 2014)},
  author = {Borin, Lars and Forsberg, Markus},
  year = {2014},
  ISBN = {978-9949-32-492-7},
}

@inProceedings{skoldberg-etal-2013-between-186041,
  title = {Between Grammars and Dictionaries: a Swedish Constructicon},
  abstract = {This paper introduces the Swedish Constructicon (SweCxn), a database of Swedish constructions currently under development. We also present a small study of the treatment of constructions in Swedish (paper) dictionaries, thus illustrating the need for a constructionist approach, and discuss three different methods used to identify potential constructions for inclusion in the constructicon. 
SweCxn is a freely available electronic resource, with a particular focus on semi-general linguistic patterns of the type that are difficult to account for from a purely lexicographic or a purely grammatical perspective, and which therefore have tended to be neglected in both dictionaries and grammars. Far from being a small set of borderline cases, such constructions are both numerous and common. They are also quite problematic for second language acquisition as well as LT applications. Accordingly, various kinds of multi-word units have received more attention in recent years, not least from a lexicographic perspective. The coverage, however, is only partial, and the productivity of many constructions is hard to capture from a lexical viewpoint. To identify constructions for SweCxn, we use a combination of methods, such as working from existing construction descriptions for Swedish and other languages, applying LT tools to discover recurring patterns in texts, and extrapolating constructional information from dictionaries. },
  booktitle = {Kosem, I., Kallas, J., Gantar, P., Krek, S., Langemets, M., Tuulik, M. (eds.) 2013. Electronic lexicography in the 21st century: thinking outside the paper. Proceedings of the eLex 2013 conference, 17-19 October 2013, Tallinn, Estonia. 
Ljubljana/Tallinn: Trojina, Institute for Applied Slovene Studies/Eesti Keele Instituut.},
  author = {Sköldberg, Emma and Bäckström, Linnéa and Borin, Lars and Forsberg, Markus and Lyngfelt, Benjamin and Olsson, Leif-Jöran and Prentice, Julia and Rydstedt, Rudolf and Tingsell, Sofia and Uppström, Jonatan},
  year = {2013},
  pages = {310--327},
}

@proceedings{borin-etal-2013-proceedings-190260,
  title = {Proceedings of the workshop on lexical semantic resources for NLP at NODALIDA 2013, May 22-24, 2013, Oslo, Norway},
  editor = {Borin, Lars and Fjeld, Ruth Vatvedt and Forsberg, Markus and Nimb, Sanni and Nugues, Pierre and Pedersen, Bolette Sandford},
  year = {2013},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  ISBN = {978-91-7519-586-5},
}

@inProceedings{borin-etal-2012-open-156079,
  title = {The open lexical infrastructure of Språkbanken},
  abstract = {We present our ongoing work on Karp, Språkbanken’s (the Swedish Language Bank) open lexical infrastructure, which has two main functions: (1) to support the work on creating, curating, and integrating our various lexical resources; and (2) to publish daily versions of the resources, making them searchable and downloadable. An important requirement on the lexical infrastructure is also that we maintain a strong bidirectional connection to our corpus infrastructure. At the heart of the infrastructure is the SweFN++ project with the goal to create free Swedish lexical resources geared towards language technology applications. The infrastructure currently hosts 15 Swedish lexical resources, including historical ones, some of which have been created from scratch using existing free resources, both external and in-house. The resources are integrated through links to a pivot lexical resource, SALDO, a large morphological and lexical-semantic resource for modern Swedish. 
SALDO has been selected as the pivot partly because of its size and quality, but also because its form and sense units have been assigned persistent identifiers (PIDs) to which the lexical information in other lexical resources and in corpora are linked.},
  booktitle = {Proceedings of the 8th International Conference on Language Resources and Evaluation : May 23-25, 2012 / eds. Nicoletta Calzolari},
  author = {Borin, Lars and Forsberg, Markus and Olsson, Leif-Jöran and Uppström, Jonatan},
  year = {2012},
  ISBN = {978-2-9517408-7-7},
  pages = {3598--3602},
}

@article{borin-etal-2013-close-187063,
  title = {Close encounters of the fifth kind: Some linguistic and computational aspects of the Swedish FrameNet++ project},
  abstract = {The Swedish FrameNet++ (SweFN++) project aims at developing an integrated Swedish lexical macro-resource to be used primarily in language technology R\&D to build natural language processing (NLP) applications. Most of the component resources making up SweFN++ are existing digital lexical resources; in their case the central project effort is directed at making them interoperable on as many levels as possible. An important new resource being created in the project is a Swedish framenet. Now a sister project is starting with the aim of adding a Swedish constructicon (SweCxn) to the macro-resource. In this paper, we discuss some theoretical and conceptual issues which have arisen in the course of our work on the SweFN++ and the planning of the SweCxn, in the close encounter between the practical requirements of NLP and the theory and practice of linguistic – lexical and grammatical – description.},
  journal = {Veredas},
  author = {Borin, Lars and Forsberg, Markus and Lyngfelt, Benjamin},
  year = {2013},
  volume = {17},
  number = {1},
  pages = {28--43},
}

@inProceedings{borin-etal-2012-korp-156080,
  title = {Korp – the corpus infrastructure of Språkbanken},
  abstract = {We present Korp, the corpus infrastructure of Språkbanken (the Swedish Language Bank). 
The infrastructure consists of three main components: the Korp corpus pipeline, the Korp backend, and the Korp frontend. The Korp corpus pipeline is used for importing corpora, annotating them, and then exporting the annotated corpora into different formats. An essential feature of the pipeline is the ability to leave existing annotations untouched, both structural and word level annotations, and to use the existing annotations as the foundation of other annotations. The Korp backend consists of a set of REST-based web services for searching in and retrieving information about the corpora. Finally, the Korp frontend is a graphical search interface that interacts with the Korp backend. The interface has been inspired by corpus search interfaces such as SketchEngine, Glossa, and DeepDict, and it uses State Chart XML (SCXML) in order to enable users to bookmark interaction states. We give a functional and technical overview of the three components, followed by a discussion of planned future work.},
  booktitle = {Proceedings of LREC 2012. Istanbul: ELRA},
  author = {Borin, Lars and Forsberg, Markus and Roxendal, Johan},
  year = {2012},
  pages = {474--478},
}

@book{ranta-forsberg-2012-implementing-168685,
  title = {Implementing Programming Languages},
  author = {Ranta, Aarne and Forsberg, Markus},
  year = {2012},
  publisher = {College Publications},
  address = {London},
  ISBN = {978-1-84890-064-6},
}

@inProceedings{vasiljevs-etal-2012-creation-156083,
  title = {Creation of an Open Shared Language Resource Repository in the Nordic and Baltic Countries},
  abstract = {The META-NORD project has contributed to an open infrastructure for language resources (data and tools) under the META-NET umbrella. This paper presents the key objectives of META-NORD and reports on the results achieved in the first year of the project. 
META-NORD has mapped and described the national language technology landscape in the Nordic and Baltic countries in terms of language use, language technology and resources, main actors in the academy, industry, government and society; identified and collected the first batch of language resources in the Nordic and Baltic countries; documented, processed, linked, and upgraded the identified language resources to agreed standards and guidelines. The three horizontal multilingual actions in META-NORD are overviewed in this paper: linking and validating Nordic and Baltic wordnets, the harmonisation of multilingual Nordic and Baltic treebanks, and consolidating multilingual terminology resources across European countries. This paper also touches upon intellectual property rights for the sharing of language resources.},
  booktitle = {Proceedings of LREC 2012},
  author = {Vasiļjevs, Andrejs and Forsberg, Markus and Gornostay, Tatiana and Hansen, Dorte H. and Jóhannsdóttir, Kristín M. and Lindén, Krister and Lyse, Gunn I. and Offersgaard, Lene and Oksanen, Ville and Olsen, Sussi and Pedersen, Bolette S. and Rögnvaldsson, Eiríkur and Rozis, Roberts and Skadiņa, Inguna and De Smedt, Koenraad},
  year = {2012},
  ISBN = {978-2-9517408-7-7},
}

@inProceedings{pedersen-etal-2013-nordic-178357,
  title = {Nordic and Baltic wordnets aligned and compared through “WordTies”},
  abstract = {During the last few years, extensive wordnets have been built locally for the Nordic and Baltic languages applying very different compilation strategies. The aim of the present investigation is to consolidate and examine these wordnets through an alignment via Princeton Core WordNet and thereby compare them along the measures of taxonomical structure, synonym structure, and assigned relations to approximate to a best practice. A common web interface and visualizer “WordTies” is developed to facilitate this purpose. 
Four bilingual wordnets are automatically processed and evaluated exposing interesting differences between the wordnets. Even if the alignments are judged to be of a good quality, the precision of the translations vary due to considerable differences in hyponymy depth and interpretation of the synset. All seven monolingual and four bilingual wordnets as well as WordTies have been made available via META-SHARE through the META-NORD project.},
  booktitle = {Proceedings of the 19th Nordic Conference of Computational Linguistics (NODALIDA 2013), May 22–24, 2013, Oslo University, Norway. NEALT Proceedings Series 16},
  author = {Pedersen, Bolette and Borin, Lars and Forsberg, Markus and Kahusk, Neeme and Lindén, Krister and Niemi, Jyrki and Nisbeth, Niklas and Nygaard, Lars and Orav, Heili and Rögnvaldsson, Eiríkur and Seaton, Mitchel and Vider, Kadri and Voionmaa, Kaarlo},
  year = {2013},
  number = {16},
  pages = {147--162},
}

@inProceedings{backstrom-etal-2013-automatic-178351,
  title = {Automatic identification of construction candidates for a Swedish constructicon},
  abstract = {We present an experiment designed for extracting construction candidates for a Swedish constructicon from text corpora. We have explored the use of hybrid n-grams with the practical goal to discover previously undescribed partially schematic constructions. The experiment was successful, in that quite a few new constructions were discovered. The precision is low, but as a push-button tool for construction discovery, it has proven a valuable tool for the work on a Swedish constructicon.},
  booktitle = {Proceedings of the workshop on lexical semantic resources for NLP at NODALIDA 2013, May 22-24, 2013, Oslo, Norway. 
NEALT Proceedings Series 19},
  author = {Bäckström, Linnéa and Borin, Lars and Forsberg, Markus and Lyngfelt, Benjamin and Prentice, Julia and Sköldberg, Emma},
  year = {2013},
  pages = {2--11},
}

@inProceedings{borin-etal-2012-growing-171988,
  title = {Growing a Swedish constructicon in lexical soil},
  booktitle = {Proceedings of the Swedish Language Technology Conference. Lund, October 24-26, 2012},
  author = {Borin, Lars and Forsberg, Markus and Lyngfelt, Benjamin and Prentice, Julia and Rydstedt, Rudolf and Sköldberg, Emma and Tingsell, Sofia},
  year = {2012},
  pages = {10--11},
}

@inProceedings{smith-etal-2012-forbattra-170895,
  title = {Hur kan vi förbättra skriftligt informations- och utbildningsmaterial för patienter som opereras elektivt för kolorektal cancer?},
  abstract = {Kolorektal cancer (KRC) är den tredje största cancerdiagnosen i Sverige med drygt 5500 drabbade årligen. Primär behandling är kirurgi kompletterad av pre- och postoperativ onkologisk behandling. Standardiserade koncept för accelererat vårdförlopp med kortare vårdtider lägger mycket fokus på fysisk rehabilitering, men mindre på den psykiska påfrestning det innebär att bli opererad för en cancerdiagnos. Patienter förväntas ta stort ansvar för sin rehabilitering, både på sjukhuset och hemma. För att vara förberedd behövs både skriftlig och muntlig information. Syftet med studien var att kartlägga och karaktärisera det skriftliga informations- och utbildningsmaterial (IOU) som används till patienter som opereras elektivt för KRC. Vidare var syftet att beskriva patienters uppfattning om struktur och innehåll på IOU. IOU från 28 kliniker som opererar patienter med KRC samlades in (totalt 220 st). För att kunna ge ett mått på texternas svårighetsgrad gjordes språkteknologisk analys på samtliga IOU, där bl.a. 
ordlängd, meningsbyggnad och jämförelse med annan typ av litteratur mättes. På 117 st gjordes en suitabilityanalys med instrumentet SAM+CAM där domän som innehåll, läsbarhet, bilder, layout samt stimulans och motivation för lärande bedömdes. Fem fokusgrupper med patienter genomfördes där patienterna uppmanades att berätta om vad de tycker utmärker ett bra respektive dåligt IOU, vad de saknar i innehåll och när och på vilket sätt de vill ha materialet utlämnat. Resultatet av språkteknologiska- och suitabilityanalysen visar att de flesta IOU bedömdes som ”adequate”, men spridningen var stor. Patienterna hade önskemål om mer nivåuppdelat/nivåriktat material, där man själv kan välja hur mycket information man vill ha vid ett visst tillfälle. Flera ämnen saknades, eller var för otydligt beskrivna för att patienterna skulle känna sig trygga vid hemgång. Resultatet av de tre analysmetoderna bör kunna användas för att utveckla en ”verktygslåda” för att i framtiden kunna utforma bättre riktat IOU för patientgruppen.},
  booktitle = {Nationella konferensen i Cancervård, 24-25 maj 2012, Stockholm},
  author = {Smith, Frida and Öhlén, Joakim and Carlsson, Eva and Forsberg, Markus and Kokkinakis, Dimitrios and Friberg, Febe},
  year = {2012},
}

@incollection{borin-etal-2008-hunting-72504,
  title = {The hunting of the BLARK - SALDO, a freely available lexical database for Swedish language technology},
  booktitle = {Resourceful language technology. Festschrift in honor of Anna Sågvall Hein},
  author = {Borin, Lars and Forsberg, Markus and Lönngren, Lennart},
  year = {2008},
  publisher = {Uppsala University},
  address = {Uppsala},
  pages = {21--32},
}

@inProceedings{lyngfelt-etal-2012-adding-163582,
  title = {Adding a constructicon to the Swedish resource network of Språkbanken},
  abstract = {This paper presents the integrated Swedish resource network of Språkbanken in general, and its latest addition – a constructicon – in particular. 
The constructicon, which is still in its early stages, is a collection of (partially) schematic multi-word units, constructions, developed as an addition to the Swedish FrameNet (SweFN). SweFN and the constructicon are integrated with other parts of Språkbanken, both lexical resources and corpora, through the lexical resource SALDO. In most respects, the constructicon is modeled on its English counterpart in Berkeley, and, thus, following the FrameNet format. The most striking differences are the inclusion of so-called collostructional elements and the treatment of semantic roles, which are defined globally instead of locally as in FrameNet. Incorporating subprojects such as developing methods for automatic identification of constructions in authentic text on the one hand, and accounting for constructions problematic for L2 acquisition on the other, the approach is highly cross-disciplinary in nature, combining various theoretical linguistic perspectives on construction grammar with language technology, lexicography, and L2 research.},
  booktitle = {11th Conference on Natural Language Processing (KONVENS) Proceedings},
  author = {Lyngfelt, Benjamin and Borin, Lars and Forsberg, Markus and Prentice, Julia and Rydstedt, Rudolf and Sköldberg, Emma and Tingsell, Sofia},
  year = {2012},
  ISBN = {3-85027-005-X},
  pages = {452--461},
}

@misc{andersen-forsberg-2012-sibirientyska-162958,
  title = {Sibirientyska},
  abstract = {German in Siberia are transcriptions of German spoken in the region of Krasnoyarsk (Russia). The corpus contains about 34 000 running words. Codeswitching to Russian and verb forms are annotated (Russian word forms in brackets like [vot], finite verb forms (FINIT), infinite verb forms (INFIN)). The transcription and annotation of the corpus have been established in collaboration with the Astafyev University Krasnoyarsk. 
The corpus is a part of a research project at the University of Gothenburg, see http://www.sprak.gu.se/kontakta-oss/larare/andersen-christiane/syntax-in-contact/ The data base is currently in the test phase. },
  author = {Andersen, Christiane and Forsberg, Markus},
  year = {2012},
  publisher = {University of Gothenburg},
  address = {Göteborg},
}

@inProceedings{borin-etal-2012-transferring-157213,
  title = {Transferring Frames: Utilization of Linked Lexical Resources},
  abstract = {In our experiment, we evaluate the transferability of frames from Swedish to Finnish in parallel corpora. We evaluate both the theoretical possibility of transferring frames and the possibility of performing it using available lexical resources. We add the frame information to an extract of the Swedish side of the Kotus and JRC-Acquis corpora using an automatic frame labeler and copy it to the Finnish side. We focus on evaluating the results to get an estimation on how often the parallel sentences can be said to express the same frame. This sheds light to the questions: Are the same situations in the two languages expressed using different frames, i.e. are the frames transferable even in theory? How well can the frame information of running text be transferred from language to another? },
  booktitle = {Proceedings of the Workshop on Inducing Linguistic Structure Submission (WILS)},
  author = {Borin, Lars and Forsberg, Markus and Johansson, Richard and Muhonen, Kristiina and Purtonen, Tanja and Voionmaa, Kaarlo},
  year = {2012},
  pages = {8--15},
}

@inProceedings{borin-etal-2012-search-157338,
  title = {Search Result Diversification Methods to Assist Lexicographers},
  abstract = {We show how the lexicographic task of finding informative and diverse example sentences can be cast as a search result diversification problem, where an objective based on relevance and diversity is maximized. 
This problem has been studied intensively in the information retrieval community during recent years, and efficient algorithms have been devised. We finally show how the approach has been implemented in a lexicographic project, and describe the relevance and diversity functions used in that context. },
  booktitle = {Proceedings of the 6th Linguistic Annotation Workshop},
  author = {Borin, Lars and Forsberg, Markus and Friberg Heppin, Karin and Johansson, Richard and Kjellandsson, Annika},
  year = {2012},
  pages = {113--117},
}

@techreport{lyngfelt-forsberg-2012-svenskt-158226,
  title = {Ett svenskt konstruktikon. Utgångspunkter och preliminära ramar},
  author = {Lyngfelt, Benjamin and Forsberg, Markus},
  year = {2012},
  institution = {University of Gothenburg},
  address = {Göteborg},
}

@inProceedings{pedersen-etal-2012-linking-155599,
  title = {Linking and validating Nordic and Baltic wordnets},
  booktitle = {Proceedings of the 6th International Global Wordnet Conference},
  author = {Pedersen, Bolette Sandford and Borin, Lars and Forsberg, Markus and Lindén, Krister and Orav, Heili and Rögnvaldsson, Eiríkur},
  year = {2012},
  pages = {254--260},
}

@inProceedings{forsberg-lager-2012-cloud-156078,
  title = {Cloud Logic Programming for Integrating Language Technology Resources},
  abstract = {The main goal of the CLT Cloud project is to equip lexica, morphological processors, parsers and other software components developed within CLT (Centre of Language Technology) with so called web API:s, thus making them available on the Internet in the form of web services. We present a proof-of-concept implementation of the CLT Cloud server where we use the logic programming language Prolog for composing and aggregating existing web services into new web services in a way that encourages creative exploration and rapid prototyping of LT applications. 
},
  booktitle = {Proceedings of LREC 2012},
  author = {Forsberg, Markus and Lager, Torbjörn},
  year = {2012},
}

@inProceedings{borin-etal-2011-semantic-140686,
  title = {Semantic Search in Literature as an e-Humanities Research Tool: CONPLISIT – Consumption Patterns and Life-Style in 19th Century Swedish Literature},
  abstract = {We present our ongoing work on language technology-based e-science in the humanities, with a focus on text-based research in the historical sciences. Currently, we are working on the adaptation and integration of lexical resources representing different historical stages of Swedish into a lexical and morphological toolbox that will allow us to develop semantically oriented text search applications for historical research on Swedish text. We describe a semantic search prototype which was built using REST web services from this toolbox as components, and which has been evaluated by historians interested in using digitized 19th century novels as primary data for an historical investigation of the emerging consumer society in 19th century Sweden.},
  booktitle = {NEALT Proceedings Series (NODALIDA 2011 Conference Proceedings)},
  author = {Borin, Lars and Forsberg, Markus and Ahlberger, Christer},
  year = {2011},
  volume = {11},
  pages = {58--65},
}

@inProceedings{smith-etal-2011-developing-152723,
  title = {Developing a toolkit for written information materials for patients with colorectal cancer undergoing elective surgery},
  abstract = {This study examines language complexity, readability and suitability, of written health information materials given to patients undergoing colorectal cancer (CRC) surgery. The overall aim is to investigate whether the implementation of adapted, person-centred information and communication for patients with CRC undergoing elective surgery, can enhance the patients’ self-care beliefs and well-being during recovery in the phase following diagnosis and initial treatment. 
Several explorative, qualitative studies are planned and will function both as a basis for the proposed interventions and provide explanations for the actual processes leading to the desired outcomes. Patients’ knowledge enablement will be reached by several interrelated intervention strategies and specific activities. One of these strategies deals with means to facilitate patients’ information seeking patterns and the goal is to provide patients with written information materials according to preferences for complex and detailed or legible texts. Thus, the interventions planed aim to enhance a movement from receiving information and instructions to participating in knowing. Written and printed patient information material from 28 Swedish clinics for patients diagnosed with CRC undergoing elective surgery were selected for analysis by means of standard metrics and more elaborate language technology techniques. Various text parameters such as lexical variation, frequency bands and the use of terminology were examined. The material was also analysed using a Suitability Assessment Instrument in order to examine content, literacy demand, graphic illustrations, layout and typography, learning stimulation and finally cultural appropriateness. In addition, five focusgroups were conducted where patients were asked to give their experiences of using written information. Results from the language technology analysis showed a variety in materials, where it could be divided in to easy, medium and difficult to read and comprehend. Patients in focusgroups told they would like written materials to be levelled in order to gain information stepwise, but also stressed the importance of information given both orally and in writing, and that they must correspond. 
Using the SAM-instrument was a good complement for deeper understanding, and taking all three analyses in account, we aim to design a balanced toolkit for how to best design written information materials where a person tailored approach can be offered. }, booktitle = {Svenska Läkaresällskapets Riksstämman}, author = {Smith, Frida and Carlsson, Eva and Friberg, Febe and Kokkinakis, Dimitrios and Forsberg, Markus and Öhrn, Matilda and Öhlén, Joakim}, year = {2011}, } @article{borin-forsberg-2011-swesaurus-151331, title = {Swesaurus – ett svenskt ordnät med fria tyglar}, journal = {LexicoNordica}, author = {Borin, Lars and Forsberg, Markus}, year = {2011}, volume = {18}, pages = {17--39}, } @incollection{borin-forsberg-2011-diachronic-144291, title = {A diachronic computational lexical resource for 800 years of Swedish}, booktitle = {Language technology for cultural heritage}, author = {Borin, Lars and Forsberg, Markus}, year = {2011}, publisher = {Springer}, address = {Berlin}, ISBN = {978-3-642-20226-1}, pages = {41--61}, } @inProceedings{borin-etal-2010-past-110368, title = {The past meets the present in Swedish FrameNet++}, abstract = {The paper is about a recently initiated project which aims at the development of a Swedish FrameNet as an integral part of a larger lexical resource, hence the name “Swedish FrameNet++” (SweFN++). It focuses on reuse of free electronic resources and their role in the acquisition and population of Swedish frames. After a brief overview of Swedish resources, we reflect on three approaches to recycling the available lexical data in a semi-automatic manner. 
SweFN++ will be a multi-functional resource supporting research within lexicology and linguistics as well as different applications within computational lexicography and language technology, not to mention e-science.}, booktitle = {14th EURALEX International Congress}, author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios}, year = {2010}, pages = {269--281}, } @article{forsberg-2011-green-140694, title = {Green resources in plain sight: opening up the SweFN++ project}, abstract = {SweFN++ is a project focused on the cre- ation and curation of Swedish lexical re- sources geared towards language technol- ogy applications. An important theme of the project is openness and its realization as a lexical infrastructure. We give a short overview of the project, elaborate on what we mean by openness, and present the current state of the lexical infrastructure. }, journal = {Proceedings of the Nodalida 2011 Workshop on visibility and availability of LT resources}, author = {Forsberg, Markus}, year = {2011}, } @inProceedings{andreasson-etal-2009-swedish-102211, title = {Swedish CLARIN activities}, booktitle = {Proceedings of the Nodalida 2009 workshop on CLARIN activities in the Nordic countries. 
NEALT Proceedings Series}, author = {Andréasson, Maia and Borin, Lars and Forsberg, Markus and Beskow, Jonas and Carlson, Rolf and Edlund, Jens and Elenius, Kjell and Hellmer, Kahl and House, David and Merkel, Magnus and Forsbom, Eva and Megyesi, Beáta and Eriksson, Anders and Strömqvist, Sven}, year = {2009}, volume = {5}, pages = {1--5}, } @article{borin-etal-2010-swedish-129126, title = {Swedish FrameNet++}, journal = {Swedish Language Technology Conference 2010}, author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios}, year = {2010}, } @article{borin-forsberg-2010-beyond-129125, title = {Beyond the synset: Swesaurus – a fuzzy Swedish wordnet}, journal = {Re-thinking synonymy: semantic sameness and similarity in languages and their description}, author = {Borin, Lars and Forsberg, Markus}, year = {2010}, } @inProceedings{borin-forsberg-2010-from-118908, title = {From the People’s Synonym Dictionary to fuzzy synsets - first steps}, booktitle = {Proceedings of the LREC 2010 workshop Semantic relations. Theory and Applications}, author = {Borin, Lars and Forsberg, Markus}, year = {2010}, pages = {18--25}, } @inProceedings{borin-etal-2010-diabase-118907, title = {Diabase: Towards a diachronic BLARK in support of historical studies}, booktitle = {Proceedings of LREC 2010}, author = {Borin, Lars and Forsberg, Markus and Kokkinakis, Dimitrios}, year = {2010}, } @inProceedings{borin-etal-2009-thinking-110343, title = {Thinking Green: Toward Swedish FrameNet++}, abstract = {Access to multi-layered lexical, grammatical and semantic information representing text content is a prerequisite for efficient automatic understanding and generation of natural language. A FrameNet is considered a valuable resource for both linguistics and language technology research that may contribute to the achievement of these goals. 
Currently, FrameNet-like resources exist for a few languages,1 including some domain-specific and multilingual initiatives (Dolbey et al., 2006; Boas, 2009; Uematsu et al., 2009; Venturi et al., 2009), but are unavailable for most languages, including Swedish, although there have been some pilot studies exploring the semi-automatic acquisition of Swedish frames (Johansson & Nugues, 2006; Borin et al., 2007). At the University of Gothenburg, we are now embarking on a project to build a Swedish FrameNet-like resource. A novel feature of this project is that the Swedish FrameNetwill be an integral part of a largermany-faceted lexical resource. Hence the name Swedish FrameNet++ (SweFN++). }, booktitle = {FrameNet Masterclass and Workshop}, author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios}, year = {2009}, } @inProceedings{borin-forsberg-2008-something-72502, title = {Something old, something new: A computational morphological description of Old Swedish}, booktitle = {LREC 2008 Workshop on Language Technology for Cultural Heritage Data (LaTeCH 2008)}, author = {Borin, Lars and Forsberg, Markus}, year = {2008}, pages = {9--16}, } @inProceedings{borin-forsberg-2009-family-102212, title = {All in the family: A comparison of SALDO and WordNet}, booktitle = {Proceedings of the Nodalida 2009 Workshop on WordNets and other Lexical Semantic Resources - between Lexical Semantics, Lexicography, Terminology and Formal Ontologies. NEALT Proceedings Series}, author = {Borin, Lars and Forsberg, Markus}, year = {2009}, volume = {7}, } @article{borin-etal-2008-saldo-110525, title = {SALDO 1.0 (Svenskt associationslexikon version 2)}, journal = {Språkbanken, Göteborg universitet}, author = {Borin, Lars and Forsberg, Markus and Lönngren, Lennart}, year = {2008}, }