% Bibliography entries in which Stian Rødven-Eide appears as author or co-author.
%
% Conventions used below:
%   * Citation keys are kept exactly as they were (including the "?" that
%     presumably replaced a non-ASCII letter on export -- TODO confirm) so
%     that existing citations keep resolving.
%   * Editors are stored in the editor field rather than inside booktitle.
%   * Proper nouns and acronyms in titles are brace-protected so that
%     sentence-casing styles do not lowercase them.
%   * The file contains UTF-8 text; abstracts are reproduced verbatim.

@inproceedings{r?dveneide-etal-2023-unsc-338176,
  author    = {Rødven-Eide, Stian and Zaczynska, Karolina and Pires, Antonio and Patz, Ronny and Stede, Manfred},
  title     = {The {UNSC-Graph}: An Extensible Knowledge Graph for the {UNSC} Corpus},
  booktitle = {Computational linguistics - Association for Computational Linguistics},
  year      = {2023},
  publisher = {Association for Computational Linguistics},
  address   = {Ingolstadt, Germany},
  abstract  = {We introduce the UNSC-Graph, a knowledge graph for a corpus of debates of the United Nations Security Council (UNSC) during the period 1995-2020. The graph combines previously disconnected data sources including from the UNSC Repertoire, the UN Library, Wikidata, and from metadata extracted from the speeches themselves. Beyond existing metadata detailing debates’ topics and participants, we also extended the graph to include all country mentions in a speech, geographical neighbours of countries mentioned, as well as sentiment scores. By linking the graph to Wikidata, we are able to include additional geopolitical information and extract various country name aliases to extend the coverage of country mentions beyond existing NER-based approaches. Studying mentions of Ukraine after 2014, we present a use case for the graph as a source for continuous analysis of international politics and geopolitical events discussed in the UNSC.},
}

% Editors were previously embedded in booktitle with a bare ampersand,
% which breaks LaTeX typesetting; they now live in the editor field.
@incollection{lindahl-r?dveneide-2022-argumentative-325260,
  author    = {Lindahl, Anna and Rødven-Eide, Stian},
  title     = {Argumentative Language Resources at {Språkbanken Text}},
  booktitle = {{CLARIN}: The Infrastructure for Language Resources},
  editor    = {Fišer, Darja and Witt, Andreas},
  pages     = {667--690},
  year      = {2022},
  publisher = {De Gruyter},
  address   = {Berlin, Boston},
  isbn      = {9783110767346},
  abstract  = {Språkbanken Text at the University of Gothenburg is a CLARIN B-centre providing language resources in Swedish, as well as tools to use them, for a wide range of disciplines. In 2017, we began exploring the field of argument mining – the process of automatically identifying and classifying arguments in text – partly aimed at establishing language resources and tools for argument analysis and mining in Swedish.},
}

% The two LREC 2018 entries below share one booktitle spelling.
@inproceedings{rouces-etal-2018-generating-264719,
  author    = {Rouces, Jacobo and Tahmasebi, Nina and Borin, Lars and Rødven-Eide, Stian},
  title     = {Generating a Gold Standard for a {Swedish} Sentiment Lexicon},
  booktitle = {{LREC} 2018, Eleventh International Conference on Language Resources and Evaluation, May 7--12, 2018, Miyazaki (Japan)},
  year      = {2018},
  publisher = {ELRA},
  address   = {Miyazaki},
  isbn      = {979-10-95546-00-9},
  abstract  = {We create a gold standard for sentiment annotation of Swedish terms, using the freely available SALDO lexicon and the Gigaword corpus. For this purpose, we employ a multi-stage approach combining corpus-based frequency sampling, direct score annotation and Best-Worst Scaling. In addition to obtaining a gold standard, we analyze the data from our process and we draw conclusions about the optimal sentiment model.},
}

@inproceedings{rouces-etal-2018-sensaldo-264720,
  author    = {Rouces, Jacobo and Tahmasebi, Nina and Borin, Lars and Rødven-Eide, Stian},
  title     = {{SenSALDO}: Creating a Sentiment Lexicon for {Swedish}},
  booktitle = {{LREC} 2018, Eleventh International Conference on Language Resources and Evaluation, May 7--12, 2018, Miyazaki (Japan)},
  year      = {2018},
  publisher = {ELRA},
  address   = {Miyazaki},
  isbn      = {979-10-95546-00-9},
  abstract  = {The natural language processing subfield known as sentiment analysis or opinion mining has seen an explosive expansion over the last decade or so, and sentiment analysis has become a standard item in the NLP toolbox. Still, many theoretical and methodological questions remain unanswered and resource gaps unfilled. Most work on automated sentiment analysis has been done on English and a few other languages; for most written languages of the world, this tool is not available. This paper describes the development of an extensive sentiment lexicon for written (standard) Swedish. We investigate different methods for developing a sentiment lexicon for Swedish. We use an existing gold standard dataset for training and testing. For each word sense from the SALDO Swedish lexicon, we assign a real value sentiment score in the range [-1,1] and produce a sentiment label. We implement and evaluate three methods: a graph-based method that iterates over the SALDO structure, a method based on random paths over the SALDO structure and a corpus-driven method based on word embeddings. The resulting sense-disambiguated sentiment lexicon (SenSALDO) is an open source resource and freely available from Språkbanken, The Swedish Language Bank at the University of Gothenburg.},
}

% Series name, volume number and editors were previously all packed into
% booktitle; they are split into their own fields here.
@inproceedings{rouces-etal-2018-defining-264721,
  author    = {Rouces, Jacobo and Borin, Lars and Tahmasebi, Nina and Rødven-Eide, Stian},
  title     = {Defining a gold standard for a {Swedish} sentiment lexicon: Towards higher-yield text mining in the digital humanities},
  booktitle = {Proceedings of the Digital Humanities in the Nordic Countries 3rd Conference, Helsinki, Finland, March 7--9, 2018},
  editor    = {Mäkelä, Eetu and Tolonen, Mikko and Tuominen, Jouni},
  series    = {{CEUR} Workshop Proceedings},
  volume    = {2084},
  year      = {2018},
  publisher = {University of Helsinki, Faculty of Arts},
  address   = {Helsinki},
  abstract  = {There is an increasing demand for multilingual sentiment analysis, and most work on sentiment lexicons is still carried out based on English lexicons like WordNet. In addition, many of the non-English sentiment lexicons that do exist have been compiled by (machine) translation from English resources, thereby arguably obscuring possible language-specific characteristics of sentiment-loaded vocabulary. In this paper we describe the creation from scratch of a gold standard for the sentiment annotation of Swedish terms as a first step towards the creation of a full-fledged sentiment lexicon for Swedish.},
}

@inproceedings{r?dveneide-2020-anforanden-302449,
  author    = {Rødven-Eide, Stian},
  title     = {Anföranden: Annotated and Augmented Parliamentary Debates from {Sweden}},
  booktitle = {Proceedings of the {LREC} 2020 Workshop on Creating, Using and Linking of Parliamentary Corpora with Other Types of Political Discourse, 11–16 May 2020},
  year      = {2020},
  publisher = {European Language Resources Association},
  address   = {Marseille, France},
  isbn      = {979-10-95546-47-4},
  abstract  = {The Swedish parliamentary debates have been available since 2010 through the parliament’s open data web site Riksdagens öppna data. While fairly comprehensive, the structure of the data can be hard to understand and its content is somewhat noisy for use as a quality language resource. In order to make it easier to use and process – in particular for language technology research, but also for political science and other fields with an interest in parliamentary data – we have published a large selection of the debates in a cleaned and structured format, annotated with linguistic information and augmented with semantic links. Especially prevalent in the parliament’s data were end-line hyphenations – something that tokenisers generally are not equipped for – and a lot of the effort went into resolving these. In this paper, we provide detailed descriptions of the structure and contents of the resource, and explain how it differs from the parliament’s own version.},
}

% Editors moved out of booktitle into the editor field.
@inproceedings{r?dveneide-2019-swedish-289474,
  author    = {Rødven-Eide, Stian},
  title     = {The {Swedish} {PoliGraph}},
  booktitle = {Proceedings of the 6th Workshop on Argument Mining, August 1, 2019, Florence, Italy},
  editor    = {Stein, Benno and Wachsmuth, Henning},
  year      = {2019},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  isbn      = {978-1-950737-33-8},
  abstract  = {As part of a larger project on argument mining of Swedish parliamentary data, we have created a semantic graph that, together with named entity recognition and resolution (NER), should make it easier to establish connections between arguments in a given debate. The graph is essentially a semantic database that keeps track of Members of Parliament (MPs), in particular their presence in the parliament and activity in debates, but also party affiliation and participation in commissions. The hope is that the Swedish PoliGraph will enable us to perform named entity resolution on debates in the Swedish parliament with a high accuracy, with the aim of determining to whom an argument is directed.},
}

@inproceedings{r?dveneide-etal-2016-swedish-250073,
  author    = {Rødven-Eide, Stian and Tahmasebi, Nina and Borin, Lars},
  title     = {The {Swedish Culturomics Gigaword Corpus}: A One Billion Word {Swedish} Reference Dataset for {NLP}},
  booktitle = {Linköping Electronic Conference Proceedings. Digital Humanities 2016. From Digitization to Knowledge 2016: Resources and Methods for Semantic Processing of Digital Works/Texts, July 11, 2016, Krakow, Poland},
  year      = {2016},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-733-5},
  abstract  = {In this paper we present a dataset of contemporary Swedish containing one billion words. The dataset consists of a wide range of sources, all annotated using a state-of-the-art corpus annotation pipeline, and is intended to be a static and clearly versioned dataset. This will facilitate reproducibility of experiments across institutions and make it easier to compare NLP algorithms on contemporary Swedish. The dataset contains sentences from 1950 to 2015 and has been carefully designed to feature a good mix of genres balanced over each included decade. The sources include literary, journalistic, academic and legal texts, as well as blogs and web forum entries.},
}

% NOTE(review): booktitle holds what looks like a series name; the actual
% proceedings title is not recoverable from this record -- TODO confirm.
@inproceedings{volodina-etal-2014-what-206132,
  author    = {Volodina, Elena and Pilán, Ildikó and Rødven-Eide, Stian and Heidarsson, Hannes},
  title     = {You get what you annotate: a pedagogically annotated corpus of coursebooks for {Swedish} as a {Second Language}},
  booktitle = {NEALT Proceedings Series},
  volume    = {22},
  pages     = {128--144},
  year      = {2014},
  isbn      = {978-91-7519-175-1},
  abstract  = {We present the COCTAILL corpus, containing over 700.000 tokens of Swedish texts from 12 coursebooks aimed at second/foreign language (L2) learning. Each text in the corpus is labelled with a proficiency level according to the CEFR proficiency scale. Genres, topics, associated activities, vocabulary lists and other types of information are annotated in the coursebooks to facilitate Second Language Acquisition (SLA)-aware studies and experiments aimed at Intelligent Computer-Assisted Language Learning (ICALL). Linguistic annotation in the form of parts-of-speech (POS; e.g. nouns, verbs), base forms (lemmas) and syntactic relations (e.g. subject, object) has been also added to the corpus. In the article we describe our annotation scheme and the editor we have developed for the content mark-up of the coursebooks, including the taxonomy of pedagogical activities and linguistic skills. Inter-annotator agreement has been computed and reported on a subset of the corpus. Surprisingly, we have not found any other examples of pedagogically marked-up corpora based on L2 coursebooks to draw on existing experiences. Hence, our work may be viewed as “groping in the darkness” and eventually a starting point for others. The paper also presents our first quantitative exploration of the corpus where we focus on textually and pedagogically annotated features of the coursebooks to exemplify what types of studies can be performed using the presented annotation scheme. We explore trends shown in use of topics and genres over proficiency levels and compare pedagogical focus of exercises across levels. The final section of the paper summarises the potential this corpus holds for research within SLA and various ICALL tasks.},
}