% Bibliography entries, one field per line. Citation keys are unchanged.
% Venue, date and editor information that had been mixed into the wrong
% fields is now stored in booktitle / editor / series / volume / publisher.

@inproceedings{nietopina-johansson-2016-embedding-241139,
  author    = {Nieto Piña, Luis and Johansson, Richard},
  title     = {Embedding Senses for Efficient Graph-based Word Sense Disambiguation},
  booktitle = {Proceedings of TextGraphs-10: the Workshop on Graph-based Methods for Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  year      = {2016},
  abstract  = {We propose a simple graph-based method for word sense disambiguation (WSD) where sense and context embeddings are constructed by applying the Skip-gram method to random walks over the sense graph. We used this method to build a WSD system for Swedish using the SALDO lexicon, and evaluated it on six different annotated test sets. In all cases, our system was several orders of magnitude faster than a state-of-the-art PageRank-based system, while outperforming a random baseline soundly.},
}

@book{nietopina-2019-splitting-282680,
  author    = {Nieto Piña, Luis},
  title     = {Splitting rocks: Learning word sense representations from corpora and lexica},
  publisher = {University of Gothenburg},
  address   = {Gothenburg},
  year      = {2019},
  isbn      = {978-91-87850-75-2},
  abstract  = {The representation of written language semantics is a central problem of language technology and a crucial component of many natural language processing applications, from part-of-speech tagging to text summarization. These representations of linguistic units, such as words or sentences, allow computer applications that work with language to process and manipulate the meaning of text. In particular, a family of models has been successfully developed based on automatically learning semantics from large collections of text and embedding them into a vector space, where semantic or lexical similarity is a function of geometric distance. Co-occurrence information of words in context is the main source of data used to learn these representations. Such models have typically been applied to learning representations for word forms, which have been widely applied, and proven to be highly successful, as characterizations of semantics at the word level.
    However, a word-level approach to meaning representation implies that the different meanings, or senses, of any polysemic word share one single representation. This might be problematic when individual word senses are of interest and explicit access to their specific representations is required. For instance, in cases such as an application that needs to deal with word senses rather than word forms, or when a digital lexicon's sense inventory has to be mapped to a set of learned semantic representations. In this thesis, we present a number of models that try to tackle this problem by automatically learning representations for word senses instead of for words. In particular, we try to achieve this by using two separate sources of information: corpora and lexica for the Swedish language. Throughout the five publications compiled in this thesis, we demonstrate that it is possible to generate word sense representations from these sources of data individually and in conjunction, and we observe that combining them yields superior results in terms of accuracy and sense inventory coverage. Furthermore, in our evaluation of the different representational models proposed here, we showcase the applicability of word sense representations both to downstream natural language processing applications and to the development of existing linguistic resources.},
}

% NOTE(review): the techreport entry below has no `institution` field, which
% the techreport type requires. The correct value cannot be determined from
% this file -- TODO confirm and add it.
@techreport{ljunglof-etal-2019-assessing-281222,
  author   = {Ljunglöf, Peter and Zechner, Niklas and Nieto Piña, Luis and Adesam, Yvonne and Borin, Lars},
  title    = {Assessing the quality of {Språkbanken}’s annotations},
  year     = {2019},
  abstract = {Most of the corpora in Språkbanken Text consist of unannotated plain text, such as almost all newspaper texts, social media texts, novels and official documents. We also have some corpora that are manually annotated in different ways, such as Talbanken (annotated for part-of-speech and syntactic structure), and the Stockholm Umeå Corpus (annotated for part-of-speech).
    Språkbanken’s annotation pipeline Sparv aims to automatise the work of automatically annotating all our corpora, while still keeping the manual annotations intact. When all corpora are annotated, they can be made available, e.g., in the corpus search tools Korp and Strix. Until now there has not been any comprehensive overview of the annotation tools and models that Sparv has been using for the last eight years. Some of them have not been updated since the start, such as the part-of-speech tagger Hunpos and the dependency parser MaltParser. There are also annotation tools that we still have not included, such as a constituency-based parser. Therefore Språkbanken initiated a project with the aim of conducting such an overview. This document is the outcome of that project, and it contains descriptions of the types of manual and automatic annotations that we currently have in Språkbanken, as well as an incomplete overview of the state-of-the-art with regards to annotation tools and models.},
}

@inproceedings{nietopina-johansson-2018-automatically-270261,
  author    = {Nieto Piña, Luis and Johansson, Richard},
  title     = {Automatically Linking Lexical Resources with Word Sense Embedding Models},
  booktitle = {The Third Workshop on Semantic Deep Learning (SemDeep-3), August 20th, 2018, Santa Fe, New Mexico, USA},
  editor    = {Espinosa Anke, Luis and Declerck, Thierry and Gromann, Dagmar},
  year      = {2018},
  isbn      = {978-1-948087-56-8},
  abstract  = {Automatically learnt word sense embeddings are developed as an attempt to refine the capabilities of coarse word embeddings. The word sense representations obtained this way are, however, sensitive to underlying corpora and parameterizations, and they might be difficult to relate to word senses as formally defined by linguists. We propose to tackle this problem by devising a mechanism to establish links between word sense embeddings and lexical resources created by experts.
    We evaluate the applicability of these links in a task to retrieve instances of Swedish word senses not present in the lexicon.},
}

@inproceedings{nietopina-johansson-2017-training-261938,
  author    = {Nieto Piña, Luis and Johansson, Richard},
  title     = {Training Word Sense Embeddings With Lexicon-based Regularization},
  booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers), Taipei, Taiwan, November 27 – December 1, 2017},
  publisher = {Asian Federation of Natural Language Processing},
  year      = {2017},
  isbn      = {978-1-948087-00-1},
  abstract  = {We propose to improve word sense embeddings by enriching an automatic corpus-based method with lexicographic data. Information from a lexicon is introduced into the learning algorithm’s objective function through a regularizer. The incorporation of lexicographic data yields embeddings that are able to reflect expert-defined word senses, while retaining the robustness, high quality, and coverage of automatic corpus-based methods. These properties are observed in a manual inspection of the semantic clusters that different degrees of regularizer strength create in the vector space. Moreover, we evaluate the sense embeddings in two downstream applications: word sense disambiguation and semantic frame prediction, where they outperform simpler approaches.
    Our results show that a corpus-based model balanced with lexicographic data learns better representations and improve their performance in downstream tasks.},
}

% The venue of this entry is a conference, so it is typed as inproceedings
% with `booktitle` rather than as a journal article.
@inproceedings{nietopina-johansson-2016-benchmarking-251412,
  author    = {Nieto Piña, Luis and Johansson, Richard},
  title     = {Benchmarking Word Sense Disambiguation Systems for {Swedish}},
  booktitle = {The Sixth Swedish Language Technology Conference},
  year      = {2016},
  abstract  = {We compare several word sense disambiguation systems for Swedish and evaluate them on seven different sense-annotated corpora. Our results show that unsupervised systems beat a random baseline, but generally do not outperform a first-sense baseline considerably. On a lexical-sample dataset that allows us to train a supervised system, the unsupervised disambiguators are strongly outperformed by the supervised one.},
}

@inproceedings{nietopina-johansson-2015-simple-222611,
  author    = {Nieto Piña, Luis and Johansson, Richard},
  title     = {A Simple and Efficient Method to Generate Word Sense Representations},
  booktitle = {Proceedings of International Conference in Recent Advances in Natural Language Processing, Hissar, Bulgaria, 7–9 September 2015},
  editor    = {Angelova, Galia and Bontcheva, Kalina and Mitkov, Ruslan},
  year      = {2015},
  pages     = {465--472},
  abstract  = {Distributed representations of words have boosted the performance of many Natural Language Processing tasks. However, usually only one representation per word is obtained, not acknowledging the fact that some words have multiple meanings. This has a negative effect on the individual word representations and the language model as a whole. In this paper we present a simple model that enables recent techniques for building word vectors to represent distinct senses of polysemic words.
    In our assessment of this model we show that it is able to effectively discriminate between words’ senses and to do so in a computationally efficient manner.},
}

@inproceedings{johansson-nietopina-2015-embedding-217863,
  author    = {Johansson, Richard and Nieto Piña, Luis},
  title     = {Embedding a Semantic Network in a Word Space},
  booktitle = {Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. Denver, United States, May 31 – June 5, 2015},
  year      = {2015},
  isbn      = {978-1-941643-49-5},
  pages     = {1428--1433},
  abstract  = {We present a framework for using continuous-space vector representations of word meaning to derive new vectors representing the meaning of senses listed in a semantic network. It is a post-processing approach that can be applied to several types of word vector representations. It uses two ideas: first, that vectors for polysemous words can be decomposed into a convex combination of sense vectors; secondly, that the vector for a sense is kept similar to those of its neighbors in the network. This leads to a constrained optimization problem, and we present an approximation for the case when the distance function is the squared Euclidean. We applied this algorithm on a Swedish semantic network, and we evaluate the quality of the resulting sense representations extrinsically by showing that they give large improvements when used in a classifier that creates lexical units for FrameNet frames.},
}

@inproceedings{borin-etal-2015-here-217351,
  author    = {Borin, Lars and Nieto Piña, Luis and Johansson, Richard},
  title     = {Here be dragons? The perils and promises of inter-resource lexical-semantic mapping},
  booktitle = {Semantic resources and semantic annotation for Natural Language Processing and the Digital Humanities. Workshop at NODALIDA, May 11, 13-18 2015, Vilnius},
  series    = {Linköping Electronic Conference Proceedings},
  volume    = {112},
  year      = {2015},
  isbn      = {978-91-7519-049-5},
  pages     = {1--11},
  abstract  = {Lexical-semantic knowledges sources are a stock item in the language technologist’s toolbox, having proved their practical worth in many and diverse natural language processing (NLP) applications. In linguistics, lexical semantics comes in many flavors, but in the NLP world, wordnets reign more or less supreme. There has been some promising work utilizing Roget-style thesauruses instead, but wider experimentation is hampered by the limited availability of such resources. The work presented here is a first step in the direction of creating a freely available Roget-style lexical resource for modern Swedish. Here, we explore methods for automatic disambiguation of inter-resource mappings with the longer-term goal of utilizing similar techniques for automatic enrichment of lexical-semantic resources.},
}

@inproceedings{johansson-nietopina-2015-combining-216865,
  author    = {Johansson, Richard and Nieto Piña, Luis},
  title     = {Combining Relational and Distributional Knowledge for Word Sense Disambiguation},
  booktitle = {Proceedings of the 20th Nordic Conference of Computational Linguistics, May 12-13, Vilnius, Lithuania},
  series    = {Linköping Electronic Conference Proceedings},
  volume    = {109},
  publisher = {Linköping University Electronic Press},
  year      = {2015},
  isbn      = {978-91-7519-098-3},
  pages     = {69--78},
  abstract  = {We present a new approach to word sense disambiguation derived from recent ideas in distributional semantics. The input to the algorithm is a large unlabeled corpus and a graph describing how senses are related; no sense-annotated corpus is needed. The fundamental idea is to embed meaning representations of senses in the same continuous-valued vector space as the representations of words. In this way, the knowledge encoded in the lexical resource is combined with the information derived by the distributional methods.
    Once this step has been carried out, the sense representations can be plugged back into e.g. the skip-gram model, which allows us to compute scores for the different possible senses of a word in a given context. We evaluated the new word sense disambiguation system on two Swedish test sets annotated with senses defined by the SALDO lexical resource. In both evaluations, our system soundly outperformed random and first-sense baselines. Its accuracy was slightly above that of a well-known graph-based system, while being computationally much more efficient.},
}