@inproceedings{saynova-etal-2023-class-326358,
  title     = {Class Explanations: the Role of Domain-Specific Content and Stop Words},
  abstract  = {We address two understudied areas related to explainability for neural text models. First, class explanations. What features are descriptive across a class, rather than explaining single input instances? Second, the type of features that are used for providing explanations. Does the explanation involve the statistical pattern of word usage or the presence of domain-specific content words? Here, we present a method to extract both class explanations and strategies to differentiate between two types of explanations – domain-specific signals or statistical variations in frequencies of common words. We demonstrate our method using a case study in which we analyse transcripts of political debates in the Swedish Riksdag.},
  booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa), Tórshavn, Faroe Islands},
  pages     = {103--112},
  author    = {Saynova, Denitsa and Bruinsma, Bastiaan and Johansson, Moa and Johansson, Richard},
  year      = {2023},
  publisher = {University of Tartu Library},
}

@inproceedings{farahani-johansson-2024-deciphering-343041,
  title     = {Deciphering the Interplay of Parametric and Non-parametric Memory in Retrieval-augmented Language Models},
  abstract  = {Generative language models often struggle with specialized or less-discussed knowledge. A potential solution is found in Retrieval-Augmented Generation (RAG) models which act like retrieving information before generating responses. In this study, we explore how the Atlas approach, a RAG model, decides between what it already knows (parametric) and what it retrieves (non-parametric). We use causal mediation analysis and controlled experiments to examine how internal representations influence information processing. Our findings disentangle the effects of parametric knowledge and the retrieved context.
They indicate that in cases where the model can choose between both types of information (parametric and non-parametric), it relies more on the context than the parametric knowledge. Furthermore, the analysis investigates the computations involved in how the model uses the information from the context. We find that multiple mechanisms are active within the model and can be detected with mediation analysis: first, the decision of whether the context is relevant, and second, how the encoder computes output representations to support copying when relevant.},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, Miami, USA},
  pages     = {16966--16977},
  author    = {Farahani, Mehrdad and Johansson, Richard},
  year      = {2024},
  publisher = {Association for Computational Linguistics},
}

@inproceedings{dannells-etal-2024-transformer-338708,
  title     = {Transformer-based {Swedish} Semantic Role Labeling through Transfer Learning},
  abstract  = {Semantic Role Labeling (SRL) is a task in natural language understanding where the goal is to extract semantic roles for a given sentence. English SRL has achieved state-of-the-art performance using Transformer techniques and supervised learning. However, this technique is not a viable choice for smaller languages like Swedish due to the limited amount of training data. In this paper, we present the first effort in building a Transformer-based SRL system for Swedish by exploring multilingual and cross-lingual transfer learning methods and leveraging the Swedish FrameNet resource. We demonstrate that multilingual transfer learning outperforms two different cross-lingual transfer models. We also found some differences between frames in FrameNet that can either hinder or enhance the model’s performance.
The resulting end-to-end model is freely available and will be made accessible through Språkbanken Text’s research infrastructure.},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), 20-25 May, 2024, Torino, Italia},
  author    = {Dannélls, Dana and Johansson, Richard and Buhr, Lucy Yang},
  year      = {2024},
  publisher = {ELRA and ICCL},
  address   = {Turin, Italy},
  isbn      = {978-2-493814-10-4},
}

@inproceedings{johansson-2024-what-337926,
  title     = {What Happens to a Dataset Transformed by a Projection-based Concept Removal Method?},
  abstract  = {We investigate the behavior of methods using linear projections to remove information about a concept from a language representation, and we consider the question of what happens to a dataset transformed by such a method. A theoretical analysis and experiments on real-world and synthetic data show that these methods inject strong statistical dependencies into the transformed datasets. After applying such a method, the representation space is highly structured: in the transformed space, an instance tends to be located near instances of the opposite label. As a consequence, the original labeling can in some cases be reconstructed by applying an anti-clustering method.},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), Torino, Italy},
  pages     = {17486--17492},
  author    = {Johansson, Richard},
  year      = {2024},
  publisher = {ELRA and ICCL},
}

@inproceedings{hagstrom-etal-2023-effect-331015,
  title     = {The Effect of Scaling, Retrieval Augmentation and Form on the Factual Consistency of Language Models},
  abstract  = {Large Language Models (LLMs) make natural interfaces to factual knowledge, but their usefulness is limited by their tendency to deliver inconsistent answers to semantically equivalent questions.
For example, a model might supply the answer “Edinburgh” to “Anne Redpath passed away in X.” and “London” to “Anne Redpath’s life ended in X.” In this work, we identify potential causes of inconsistency and evaluate the effectiveness of two mitigation strategies: up-scaling and augmenting the LM with a passage retrieval database. Our results on the LLaMA and Atlas models show that both strategies reduce inconsistency but that retrieval augmentation is considerably more efficient. We further consider and disentangle the consistency contributions of different components of Atlas. For all LMs evaluated we find that syntactical form and task artifacts impact consistency. Taken together, our results provide a better understanding of the factors affecting the factual consistency of language models.},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, Singapore},
  pages     = {5457--5476},
  author    = {Hagström, Lovisa and Saynova, Denitsa and Norlund, Tobias and Johansson, Moa and Johansson, Richard},
  year      = {2023},
  publisher = {Association for Computational Linguistics},
}

@inproceedings{ghosh-etal-2011-discourse-151350,
  title     = {End-to-End Discourse Parser Evaluation},
  abstract  = {We are interested in the problem of discourse parsing of textual documents. We present a novel end-to-end discourse parser that, given a plain text document in input, identifies the discourse relations in the text, assigns them a semantic label and detects discourse arguments spans. The parsing architecture is based on a cascade of decisions supported by Conditional Random Fields (CRF). We train and evaluate three different parsers using the PDTB corpus.
The three system versions are compared to evaluate their robustness with respect to deep/shallow and automatically extracted syntactic features.},
  booktitle = {Fifth IEEE International Conference on Semantic Computing (ICSC), 2011; September 18-21, 2011; Palo Alto, United States},
  author    = {Ghosh, Sucheta and Tonelli, Sara and Riccardi, Giuseppe and Johansson, Richard},
  year      = {2011},
  isbn      = {978-1-4577-1648-5},
}

@inproceedings{mogren-johansson-2017-character-256929,
  title     = {Character-based Recurrent Neural Networks for Morphological Relational Reasoning},
  abstract  = {We present a model for predicting word forms based on morphological relational reasoning with analogies. While previous work has explored tasks such as morphological inflection and reinflection, these models rely on an explicit enumeration of morphological features, which may not be available in all cases. To address the task of predicting a word form given a demo relation (a pair of word forms) and a query word, we devise a character-based recurrent neural network architecture using three separate encoders and a decoder. We also investigate a multiclass learning setup, where the prediction of the relation type label is used as an auxiliary task. Our results show that the exact form can be predicted for English with an accuracy of 94.7%. For Swedish, which has a more complex morphology with more inflectional patterns for nouns and verbs, the accuracy is 89.3%.
We also show that using the auxiliary task of learning the relation type speeds up convergence and improves the prediction accuracy for the word generation task.},
  booktitle = {Proceedings of the First Workshop on Subword and Character Level Models in NLP},
  author    = {Mogren, Olof and Johansson, Richard},
  year      = {2017},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA, United States},
}

@inproceedings{johansson-etal-2016-multi-233140,
  title     = {A Multi-domain Corpus of {Swedish} Word Sense Annotation},
  abstract  = {We describe the word sense annotation layer in Eukalyptus, a freely available five-domain corpus of contemporary Swedish with several annotation layers. The annotation uses the SALDO lexicon to define the sense inventory, and allows word sense annotation of compound segments and multiword units. We give an overview of the new annotation tool developed for this project, and finally present an analysis of the inter-annotator agreement between two annotators.},
  booktitle = {10th edition of the Language Resources and Evaluation Conference, 23-28 May 2016, Portorož (Slovenia)},
  author    = {Johansson, Richard and Adesam, Yvonne and Bouma, Gerlof and Hedberg, Karin},
  year      = {2016},
  publisher = {European Language Resources Association},
  isbn      = {978-2-9517408-9-1},
}

@inproceedings{adouane-johansson-2016-gulf-242243,
  title     = {{Gulf Arabic} Resource Building for Sentiment Analysis},
  abstract  = {This paper deals with building linguistic resources for Gulf Arabic, one of the Arabic variations, for sentiment analysis task using machine learning. To our knowledge, no previous works were done for Gulf Arabic sentiment analysis despite the fact that it is present in different online platforms. Hence, the first challenge is the absence of annotated data and sentiment lexicons. To fill this gap, we created these two main linguistic resources.
Then we conducted different experiments: use Naive Bayes classifier without any lexicon; add a sentiment lexicon designed basically for MSA; use only the compiled Gulf Arabic sentiment lexicon and finally use both MSA and Gulf Arabic sentiment lexicons. The Gulf Arabic lexicon gives a good improvement of the classifier accuracy (90.54 %) over a baseline that does not use the lexicon (82.81%), while the MSA lexicon causes the accuracy to drop to (76.83%). Moreover, mixing MSA and Gulf Arabic lexicons causes the accuracy to drop to (84.94%) compared to using only Gulf Arabic lexicon. This indicates that it is useless to use MSA resources to deal with Gulf Arabic due to the considerable differences and conflicting structures between these two languages.},
  booktitle = {Proceedings of the Language Resources and Evaluation Conference (LREC), 23-28 May 2016, Portorož, Slovenia},
  author    = {Adouane, Wafia and Johansson, Richard},
  year      = {2016},
  publisher = {European Language Resources Association},
  isbn      = {978-2-9517408-9-1},
}

@inproceedings{ehrlemark-etal-2016-retrieving-242241,
  title     = {Retrieving Occurrences of Grammatical Constructions},
  abstract  = {Finding authentic examples of grammatical constructions is central in constructionist approaches to linguistics, language processing, and second language learning. In this paper, we address this problem as an information retrieval (IR) task. To facilitate research in this area, we built a benchmark collection by annotating the occurrences of six constructions in a Swedish corpus. Furthermore, we implemented a simple and flexible retrieval system for finding construction occurrences, in which the user specifies a ranking function using lexical-semantic similarities (lexicon-based or distributional).
The system was evaluated using standard IR metrics on the new benchmark, and we saw that lexical-semantical rerankers improve significantly over a purely surface-oriented system, but must be carefully tailored for each individual construction.},
  booktitle = {Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers, December 11–17; Osaka, Japan},
  author    = {Ehrlemark, Anna and Johansson, Richard and Lyngfelt, Benjamin},
  year      = {2016},
  isbn      = {978-4-87974-702-0},
}

@inproceedings{nietopina-johansson-2016-embedding-241139,
  title     = {Embedding Senses for Efficient Graph-based Word Sense Disambiguation},
  abstract  = {We propose a simple graph-based method for word sense disambiguation (WSD) where sense and context embeddings are constructed by applying the Skip-gram method to random walks over the sense graph. We used this method to build a WSD system for Swedish using the SALDO lexicon, and evaluated it on six different annotated test sets. In all cases, our system was several orders of magnitude faster than a state-of-the-art PageRank-based system, while outperforming a random baseline soundly.},
  booktitle = {Proceedings of TextGraphs-10: the Workshop on Graph-based Methods for Natural Language Processing},
  author    = {Nieto Piña, Luis and Johansson, Richard},
  year      = {2016},
  publisher = {Association for Computational Linguistics},
}

@article{hagberg-etal-2022-semi-314455,
  title     = {Semi-supervised learning with natural language processing for right ventricle classification in echocardiography—a scalable approach},
  abstract  = {We created a deep learning model, trained on text classified by natural language processing (NLP), to assess right ventricular (RV) size and function from echocardiographic images. We included 12,684 examinations with corresponding written reports for text classification. After manual annotation of 1489 reports, we trained an NLP model to classify the remaining 10,651 reports.
A view classifier was developed to select the 4-chamber or RV-focused view from an echocardiographic examination (n = 539). The final models were two image classification models trained on the predicted labels from the combined manual annotation and NLP models and the corresponding echocardiographic view to assess RV function (training set n = 11,008) and size (training set n = 9951). The text classifier identified impaired RV function with 99% sensitivity and 98% specificity and RV enlargement with 98% sensitivity and 98% specificity. The view classification model identified the 4-chamber view with 92% accuracy and the RV-focused view with 73% accuracy. The image classification models identified impaired RV function with 93% sensitivity and 72% specificity and an enlarged RV with 80% sensitivity and 85% specificity; agreement with the written reports was substantial (both κ = 0.65). Our findings show that models for automatic image assessment can be trained to classify RV size and function by using model-annotated data from written echocardiography reports. This pipeline for auto-annotation of the echocardiographic images, using a NLP model with medical reports as input, can be used to train an image-assessment model without manual annotation of images and enables fast and inexpensive expansion of the training dataset when needed. © 2022},
  journal   = {Computers in Biology and Medicine},
  author    = {Hagberg, Eva and Hagerman, David and Johansson, Richard and Hosseini, N. and Liu, J. and Björnsson, E. and Alvén, Jennifer and Hjelmgren, Ola},
  year      = {2022},
  volume    = {143},
}

@inproceedings{doostmohammadi-etal-2023-surface-327186,
  title     = {Surface-Based Retrieval Reduces Perplexity of Retrieval-Augmented Language Models},
  abstract  = {Augmenting language models with a retrieval mechanism has been shown to significantly improve their performance while keeping the number of parameters low.
Retrieval-augmented models commonly rely on a semantic retrieval mechanism based on the similarity between dense representations of the query chunk and potential neighbors. In this paper, we study the state-of-the-art Retro model and observe that its performance gain is better explained by surface-level similarities, such as token overlap. Inspired by this, we replace the semantic retrieval in Retro with a surface-level method based on BM25, obtaining a significant reduction in perplexity. As full BM25 retrieval can be computationally costly for large datasets, we also apply it in a re-ranking scenario, gaining part of the perplexity reduction with minimal computational overhead.},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), Toronto, Canada},
  pages     = {521--529},
  author    = {Doostmohammadi, Ehsan and Norlund, Tobias and Kuhlmann, Marco and Johansson, Richard},
  year      = {2023},
  publisher = {Association for Computational Linguistics},
}

@inproceedings{norlund-etal-2023-generalization-326357,
  title     = {On the Generalization Ability of Retrieval-Enhanced Transformers},
  abstract  = {Recent work on the Retrieval-Enhanced Transformer (RETRO) model has shown impressive results: off-loading memory from trainable weights to a retrieval database can significantly improve language modeling and match the performance of non-retrieval models that are an order of magnitude larger in size. It has been suggested that at least some of this performance gain is due to non-trivial generalization based on both model weights and retrieval. In this paper, we try to better understand the relative contributions of these two components. We find that the performance gains from retrieval to a very large extent originate from overlapping tokens between the database and the test data, suggesting less of non-trivial generalization than previously assumed.
More generally, our results point to the challenges of evaluating the generalization of retrieval-augmented language models such as RETRO, as even limited token overlap may significantly decrease test-time loss. We release our code and model at https://github.com/TobiasNorlund/retro},
  booktitle = {Findings of the Association for Computational Linguistics: EACL 2023, Dubrovnik, Croatia},
  pages     = {1485--1493},
  author    = {Norlund, Tobias and Doostmohammadi, Ehsan and Johansson, Richard and Kuhlmann, Marco},
  year      = {2023},
  publisher = {Association for Computational Linguistics},
}

@inproceedings{malik-johansson-2022-controlling-323885,
  title     = {Controlling for Stereotypes in Multimodal Language Model Evaluation},
  abstract  = {We propose a methodology and design two benchmark sets for measuring to what extent language-and-vision language models use the visual signal in the presence or absence of stereotypes. The first benchmark is designed to test for stereotypical colors of common objects, while the second benchmark considers gender stereotypes. The key idea is to compare predictions when the image conforms to the stereotype to predictions when it does not. Our results show that there is significant variation among multimodal models: the recent Transformer-based FLAVA seems to be more sensitive to the choice of image and less affected by stereotypes than older CNN-based models such as VisualBERT and LXMERT.
This effect is more discernible in this type of controlled setting than in traditional evaluations where we do not know whether the model relied on the stereotype or the visual signal.},
  booktitle = {Proceedings of the Fifth BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP, Abu Dhabi},
  pages     = {263--271},
  author    = {Malik, Manuj and Johansson, Richard},
  year      = {2022},
  publisher = {Association for Computational Linguistics},
}

@incollection{johansson-2022-coveting-326360,
  title     = {Coveting Your Neighbor's Wife: Using Lexical Neighborhoods in Substitution-based Word Sense Disambiguation},
  abstract  = {We explore a simple approach to word sense disambiguation for the case where a graph-structured lexicon of word sense identifiers is available, but no definitions or annotated training examples. The key idea is to consider the neighborhood in a lexical graph to generate a set of potential substitutes of the target word, which can then be compared to a set of substitutes suggested by a language model for a given context. We applied the proposed method to the SALDO lexicon for Swedish and used a BERT model to propose contextual substitutes.
The system was evaluated on sense-annotated corpora, and despite its simplicity we see a strong improvement over previously proposed models for unsupervised SALDO-based word sense disambiguation.},
  booktitle = {LIVE and LEARN – Festschrift in honor of Lars Borin},
  editor    = {Volodina, Elena and Dannélls, Dana and Berdicevskis, Aleksandrs and Forsberg, Markus and Virk, Shafqat},
  author    = {Johansson, Richard},
  year      = {2022},
  publisher = {GU-ISS Forskningsrapporter från Institutionen för svenska, flerspråkighet och språkteknologi},
  address   = {Göteborg},
  pages     = {61--66},
}

@inproceedings{farahani-johansson-2023-empirical-326359,
  title     = {An Empirical Study of Multitask Learning to Improve Open Domain Dialogue Systems},
  abstract  = {Autoregressive models used to generate responses in open-domain dialogue systems often struggle to take long-term context into account and to maintain consistency over a dialogue. Previous research in open-domain dialogue generation has shown that the use of auxiliary tasks can introduce inductive biases that encourage the model to improve these qualities. However, most previous research has focused on encoder-only or encoder/decoder models, while the use of auxiliary tasks in decoder-only autoregressive models is under-explored. This paper describes an investigation where four different auxiliary tasks are added to small and medium-sized GPT-2 models fine-tuned on the PersonaChat and DailyDialog datasets.
The results show that the introduction of the new auxiliary tasks leads to small but consistent improvement in evaluations of the investigated models.},
  booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa), Tórshavn, Faroe Islands},
  pages     = {347--357},
  author    = {Farahani, Mehrdad and Johansson, Richard},
  year      = {2023},
  publisher = {University of Tartu Library},
}

@misc{ljunglof-etal-2021-selected-306645,
  title     = {Selected contributions from the {Eighth Swedish Language Technology Conference} ({SLTC-2020}), 25-27 {November} 2020},
  abstract  = {Selected extended papers from the Eighth Swedish Language Technology Conference (SLTC-2020) which was held between 25-27 November 2020 in Gothenburg and online.},
  author    = {Ljunglöf, Peter and Dobnik, Simon and Johansson, Richard},
  year      = {2021},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping, Sweden},
  isbn      = {978-91-7929-031-3},
}

@inproceedings{raj-etal-2022-cross-323886,
  title     = {Cross-modal Transfer Between Vision and Language for Protest Detection},
  abstract  = {Most of today’s systems for socio-political event detection are text-based, while an increasing amount of information published on the web is multi-modal. We seek to bridge this gap by proposing a method that utilizes existing annotated unimodal data to perform event detection in another data modality, zero-shot. Specifically, we focus on protest detection in text and images, and show that a pretrained vision-and-language alignment model (CLIP) can be leveraged towards this end.
In particular, our results suggest that annotated protest text data can act supplementarily for detecting protests in images, but significant transfer is demonstrated in the opposite direction as well.},
  booktitle = {Proceedings of the 5th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Text (CASE), Abu Dhabi},
  pages     = {56--60},
  author    = {Raj, Ria and Andréasson, Kajsa and Norlund, Tobias and Johansson, Richard and Lagerberg, Aron},
  year      = {2022},
  publisher = {Association for Computational Linguistics},
}

@inproceedings{hagstrom-johansson-2022-adapt-319269,
  title     = {How to Adapt Pre-trained Vision-and-Language Models to a Text-only Input?},
  abstract  = {Current language models have been criticised for learning language from text alone without connection between words and their meaning. Consequently, multimodal training has been proposed as a way for creating models with better language understanding by providing the lacking connection. We focus on pre-trained multimodal vision-and-language (VL) models for which there already are some results on their language understanding capabilities. An unresolved issue with evaluating the linguistic skills of these models, however, is that there is no established method for adapting them to text-only input without out-of-distribution uncertainty. To find the best approach, we investigate and compare seven possible methods for adapting three different pre-trained VL models to text-only input. Our evaluations on both GLUE and Visual Property Norms (VPN) show that care should be put into adapting VL models to zero-shot text-only tasks, while the models are less sensitive to how we adapt them to non-zero-shot tasks.
We also find that the adaptation methods perform differently for different models and that unimodal model counterparts perform on par with the VL models regardless of adaptation, indicating that current VL models do not necessarily gain better language understanding from their multimodal training.},
  booktitle = {Proceedings of the 29th International Conference on Computational Linguistics, Gyeongju, Republic of Korea},
  pages     = {5582--5596},
  author    = {Hagström, Lovisa and Johansson, Richard},
  year      = {2022},
  publisher = {International Committee on Computational Linguistics},
}

@inproceedings{hagstrom-johansson-2022-small-318477,
  title     = {Can We Use Small Models to Investigate Multimodal Fusion Methods?},
  abstract  = {Many successful methods for fusing language with information from the visual modality have recently been proposed and the topic of multimodal training is ever evolving. However, it is still largely not known what makes different vision-and-language models successful. Investigations into this are made difficult by the large sizes of the models used, requiring large training datasets and causing long train and compute times. Therefore, we propose the idea of studying multimodal fusion methods in a smaller setting with small models and datasets. In this setting, we can experiment with different approaches for fusing multimodal information with language in a controlled fashion, while allowing for fast experimentation. We illustrate this idea with the math arithmetics sandbox. This is a setting in which we fuse language with information from the math modality and strive to replicate some fusion methods from the vision-and-language domain.
We find that some results for fusion methods from the larger domain translate to the math arithmetics sandbox, indicating a promising future avenue for multimodal model prototyping.},
  booktitle = {Proceedings of the 2022 CLASP Conference on (Dis)embodiment, Gothenburg, Sweden},
  pages     = {45--50},
  author    = {Hagström, Lovisa and Johansson, Richard},
  year      = {2022},
  publisher = {Association for Computational Linguistics},
}

@inproceedings{johansson-2012-atomic-156993,
  title     = {Non-atomic Classification to Improve a Semantic Role Labeler for a Low-resource Language},
  abstract  = {Semantic role classification accuracy for most languages other than English is constrained by the small amount of annotated data. In this paper, we demonstrate how the frame-to-frame relations described in the FrameNet ontology can be used to improve the performance of a FrameNet-based semantic role classifier for Swedish, a low-resource language. In order to make use of the FrameNet relations, we cast the semantic role classification task as a non-atomic label prediction task. The experiments show that the cross-frame generalization methods lead to a 27% reduction in the number of errors made by the classifier. For previously unseen frames, the reduction is even more significant: 50%.},
  booktitle = {Proceedings of the First Joint Conference on Lexical and Computational Semantics (*SEM); June 7-8; Montréal, Canada},
  author    = {Johansson, Richard},
  year      = {2012},
  publisher = {Association for Computational Linguistics},
  address   = {Montréal, Canada},
}

@inproceedings{daoud-etal-2022-conceptualizing-317410,
  title     = {Conceptualizing Treatment Leakage in Text-based Causal Inference},
  abstract  = {Causal inference methods that control for text-based confounders are becoming increasingly important in the social sciences and other disciplines where text is readily available.
However, these methods rely on a critical assumption that there is no treatment leakage: that is, the text only contains information about the confounder and no information about treatment assignment. When this assumption does not hold, methods that control for text to adjust for confounders face the problem of post-treatment (collider) bias. However, the assumption that there is no treatment leakage may be unrealistic in real-world situations involving text, as human language is rich and flexible. Language appearing in a public policy document or health records may refer to the future and the past simultaneously, and thereby reveal information about the treatment assignment. In this article, we define the treatment-leakage problem, and discuss the identification as well as the estimation challenges it raises. Second, we delineate the conditions under which leakage can be addressed by removing the treatment-related signal from the text in a pre-processing step we define as text distillation. Lastly, using simulation, we show how treatment leakage introduces a bias in estimates of the average treatment effect (ATE) and how text distillation can mitigate this bias.},
  booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Seattle, United States},
  pages     = {5638--5645},
  author    = {Daoud, Adel and Jerzak, Connor T. and Johansson, Richard},
  year      = {2022},
  publisher = {Association for Computational Linguistics},
}

@inproceedings{hagstrom-johansson-2022-what-316251,
  title     = {What do Models Learn From Training on More Than Text? Measuring Visual Commonsense Knowledge},
  abstract  = {There are limitations in learning language from text alone. Therefore, recent focus has been on developing multimodal models. However, few benchmarks exist that can measure what language models learn about language from multimodal training.
We hypothesize that training on a visual modality should improve on the visual commonsense knowledge in language models. Therefore, we introduce two evaluation tasks for measuring visual commonsense knowledge in language models (code publicly available at: github.com/lovhag/measure-visual-commonsense-knowledge) and use them to evaluate different multimodal models and unimodal baselines. Primarily, we find that the visual commonsense knowledge is not significantly different between the multimodal models and unimodal baseline models trained on visual text data.},
  booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop, Dublin, Ireland},
  pages     = {252--261},
  author    = {Hagström, Lovisa and Johansson, Richard},
  year      = {2022},
  publisher = {Association for Computational Linguistics},
}

@inproceedings{norlund-etal-2021-transferring-309548,
  title     = {Transferring Knowledge from Vision to Language: How to Achieve it and how to Measure it?},
  abstract  = {Large language models are known to suffer from the hallucination problem in that they are prone to output statements that are false or inconsistent, indicating a lack of knowledge. A proposed solution to this is to provide the model with additional data modalities that complements the knowledge obtained through text. We investigate the use of visual data to complement the knowledge of large language models by proposing a method for evaluating visual knowledge transfer to text for uni- or multimodal language models. The method is based on two steps, 1) a novel task querying for knowledge of memory colors, i.e. typical colors of well-known objects, and 2) filtering of model training data to clearly separate knowledge contributions. Additionally, we introduce a model architecture that involves a visual imagination step and evaluate it with our proposed method.
We find that our method can successfully be used to measure visual knowledge transfer capabilities in models and that our novel model architecture shows promising results for leveraging multimodal knowledge in a unimodal setting.},
  booktitle = {Proceedings of the Fourth BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP, Punta Cana, Dominican Republic},
  author = {Norlund, Tobias and Hagström, Lovisa and Johansson, Richard},
  year = {2021},
  publisher = {Association for Computational Linguistics},
  pages = {149--162},
}

@incollection{johansson-etal-2021-semantic-310775,
  title = {Semantic Role Labeling},
  booktitle = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications},
  editor = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
  author = {Johansson, Richard and Friberg Heppin, Karin and Kokkinakis, Dimitrios},
  year = {2021},
  publisher = {John Benjamins Publishing Company},
  address = {Amsterdam / Philadelphia},
  ISBN = {978-90-272-5848-9},
  pages = {264--280},
}

@incollection{johansson-2021-resource-310770,
  title = {NLP for Resource Building},
  booktitle = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications},
  editor = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
  author = {Johansson, Richard},
  year = {2021},
  publisher = {John Benjamins Publishing Company},
  address = {Amsterdam / Philadelphia},
  ISBN = {978-90-272-5848-9},
  pages = {169--190},
}

@inProceedings{hagstrom-johansson-2021-knowledge-305832,
  title = {Knowledge Distillation for Swedish NER models: A Search for Performance and Efficiency},
  abstract = {The current recipe for better model performance within NLP is to increase model size and training data. While it gives us models with increasingly impressive results, it also makes it more difficult to train and deploy state-of-the-art models for NLP due to increasing computational costs. Model compression is a field of research that aims to alleviate this problem.
The field encompasses different methods that aim to preserve the performance of a model while decreasing the size of it. One such method is knowledge distillation. In this article, we investigate the effect of knowledge distillation for named entity recognition models in Swedish. We show that while some sequence tagging models benefit from knowledge distillation, not all models do. This prompts us to ask questions about in which situations and for which models knowledge distillation is beneficial. We also reason about the effect of knowledge distillation on computational costs.},
  booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa 2021), Reykjavík, Iceland},
  author = {Hagström, Lovisa and Johansson, Richard},
  year = {2021},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  ISBN = {978-91-7929-614-8},
  pages = {124--134},
}

@inProceedings{akerstrom-etal-2019-natural-284338,
  title = {Natural Language Processing in Policy Evaluation: Extracting Policy Conditions from IMF Loan Agreements},
  abstract = {Social science researchers often use text as the raw data in investigations: for instance, when investigating the effects of IMF policies on the development of countries under IMF programs, researchers typically encode structured descriptions of the programs using a time-consuming manual effort. Making this process automatic may open up new opportunities in scaling up such investigations. As a first step towards automatizing this coding process, we describe an experiment where we apply a sentence classifier that automatically detects mentions of policy conditions in IMF loan agreements and divides them into different types.
The results show that the classifier is generally able to detect the policy conditions, although some types are hard to distinguish.},
  booktitle = {Proceedings of the 22nd Nordic Conference on Computational Linguistics; September 30 – October 2; Turku, Finland},
  author = {Åkerström, Joakim and Daoud, Adel and Johansson, Richard},
  year = {2019},
  publisher = {Linköping University Electronic Press},
}

@inProceedings{qwaider(abukwaik)-etal-2020-arabic-291768,
  title = {An Arabic Tweets Sentiment Analysis Dataset (ATSAD) using Distant Supervision and Self Training},
  abstract = {As the number of social media users increases, they express their thoughts, needs, socialise and publish their opinions. For good social media sentiment analysis, good quality resources are needed, and the lack of these resources is particularly evident for languages other than English, in particular Arabic. The available Arabic resources lack of from either the size of the corpus or the quality of the annotation. In this paper, we present an Arabic Sentiment Analysis Corpus collected from Twitter, which contains 36K tweets labelled into positive and negative. We employed distant supervision and self-training approaches into the corpus to annotate it. Besides, we release an 8K tweets manually annotated as a gold standard. We evaluated the corpus intrinsically by comparing it to human classification and pre-trained sentiment analysis models.
Moreover, we apply extrinsic evaluation methods exploiting sentiment analysis task and achieve an accuracy of 86%.},
  booktitle = {Proceedings of the 4th Workshop on Open-Source Arabic Corpora and Processing Tools with a Shared Task on Offensive Language Detection (OSACT4-2020) at Language Resources and Evaluation Conference (LREC 2020), Marseille, 11–16 May 2020},
  editor = {Al-Khalifa, Hend and Magdy, Walid and Darwish, Kareem and Elsayed, Tamer and Mubarak, Hamdy},
  author = {Qwaider (abu kwaik), Chatrine (kathrein) and Chatzikyriakidis, Stergios and Dobnik, Simon and Johansson, Richard and Saad, Motaz},
  year = {2020},
  publisher = {European Language Resources Association (ELRA)},
  address = {Marseille, France},
  ISBN = {979-10-95546-51-1},
}

@inProceedings{johansson-adesam-2020-training-293365,
  title = {Training a Swedish Constituency Parser on Six Incompatible Treebanks},
  abstract = {We investigate a transition-based parser that uses Eukalyptus, a function-tagged constituent treebank for Swedish which includes discontinuous constituents. In addition, we show that the accuracy of this parser can be improved by using a multitask learning architecture that makes it possible to train the parser on additional treebanks that use other annotation models.},
  booktitle = {Proceedings of the 12th International Conference on Language Resources and Evaluation (LREC 2020)},
  author = {Johansson, Richard and Adesam, Yvonne},
  year = {2020},
  publisher = {European Language Resources Association (ELRA)},
}

@article{sandberg-etal-2019-issue-285614,
  title = {Issue Salience on Twitter During Swedish Party Leaders’ Debates},
  abstract = {The objective of this study is to contribute knowledge about formation of political agendas on Twitter during mediated political events, using the party leaders’ debates in Sweden before the general election of 2014 as a case study. Our findings show that issues brought up during the debates were largely mirrored on Twitter, with one striking discrepancy.
Contrary to our expectations, issues on the left-right policy dimension were more salient on Twitter than in the debates, whereas issues such as the environment, immigration and refugees, all tied to a liberal-authoritarian value axis, were less salient on Twitter.},
  journal = {Nordicom Review},
  author = {Sandberg, Linn and Bjereld, Ulf and Bunyik, Karina and Forsberg, Markus and Johansson, Richard},
  year = {2019},
  volume = {40},
  number = {2},
  pages = {49--61},
}

@article{mogren-johansson-2019-character-285612,
  title = {Character-based Recurrent Neural Networks for Morphological Relational Reasoning},
  abstract = {We present a model for predicting inflected word forms based on morphological analogies. Previous work includes rule-based algorithms that determine and copy affixes from one word to another, with limited support for varying inflectional patterns. In related tasks such as morphological reinflection, the algorithm is provided with an explicit enumeration of morphological features which may not be available in all cases. In contrast, our model is feature-free: instead of explicitly representing morphological features, the model is given a demo pair that implicitly specifies a morphological relation (such as write:writes specifying infinitive:present). Given this demo relation and a query word (e.g. watch), the model predicts the target word (e.g. watches). To address this task, we devise a character-based recurrent neural network architecture using three separate encoders and one decoder. Our experimental evaluation on five different languages shows that the exact form can be predicted with high accuracy, consistently beating the baseline methods. Particularly, for English the prediction accuracy is 95.60%.
The solution is not limited to copying affixes from the demo relation, but generalizes to words with varying inflectional patterns, and can abstract away from the orthographic level to the level of morphological forms.},
  journal = {Journal of Language Modelling},
  author = {Mogren, Olof and Johansson, Richard},
  year = {2019},
  volume = {7},
  number = {1},
  pages = {93--124},
}

@inProceedings{adesam-etal-2018-eukalyptus-273839,
  title = {The Eukalyptus Treebank of Written Swedish},
  booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018},
  author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard and Borin, Lars and Forsberg, Markus},
  year = {2018},
}

@inProceedings{adesam-etal-2018-koala-273841,
  title = {The Koala Part-of-Speech and Morphological Tagset for Swedish},
  booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018},
  author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard},
  year = {2018},
}

@inProceedings{fares-etal-2018-2018-272105,
  title = {The 2018 Shared Task on Extrinsic Parser Evaluation: On the Downstream Utility of English Universal Dependency Parsers},
  abstract = {We summarize empirical results and tentative conclusions from the Second Extrinsic Parser Evaluation Initiative (EPE 2018). We review the basic task setup, downstream applications involved, and end-to-end results for seventeen participating parsers.
Based on both quantitative and qualitative analysis, we correlate intrinsic evaluation results at different layers of morph-syntactic analysis with observed downstream behavior.},
  booktitle = {Proceedings of the CoNLL 2018 Shared Task: Multilingual Parsing from Raw Text to Universal Dependencies},
  author = {Fares, Murhaf and Oepen, Stephan and Øvrelid, Lilja and Björne, Jari and Johansson, Richard},
  year = {2018},
  publisher = {Association for Computational Linguistics},
}

@inProceedings{nietopina-johansson-2018-automatically-270261,
  title = {Automatically Linking Lexical Resources with Word Sense Embedding Models},
  abstract = {Automatically learnt word sense embeddings are developed as an attempt to refine the capabilities of coarse word embeddings. The word sense representations obtained this way are, however, sensitive to underlying corpora and parameterizations, and they might be difficult to relate to word senses as formally defined by linguists. We propose to tackle this problem by devising a mechanism to establish links between word sense embeddings and lexical resources created by experts. We evaluate the applicability of these links in a task to retrieve instances of Swedish word senses not present in the lexicon.},
  booktitle = {The Third Workshop on Semantic Deep Learning (SemDeep-3), August 20th, 2018, Santa Fe, New Mexico, USA},
  editor = {Espinosa Anke, Luis and Declerck, Thierry and Gromann, Dagmar},
  author = {Nieto Piña, Luis and Johansson, Richard},
  year = {2018},
  ISBN = {978-1-948087-56-8},
}

@inProceedings{oepen-etal-2017-2017-264156,
  title = {The 2017 Shared Task on Extrinsic Parser Evaluation. Towards a Reusable Community Infrastructure},
  abstract = {The 2017 Shared Task on Extrinsic Parser Evaluation (EPE 2017) seeks to provide better estimates of the relative utility of different types of dependency representations for a variety of downstream applications that depend centrally on the analysis of grammatical structure.
EPE 2017 defines a generalized notion of lexicalized syntactico-semantic dependency representations and provides a common interchange format to three state-of-the-art downstream applications, viz. biomedical event extraction, negation resolution, and fine-grained opinion analysis. As a first step towards building a generic and extensible infrastructure for extrinsic parser evaluation, the downstream applications have been generalized to support a broad range of diverse dependency representations (including divergent sentence and token boundaries) and to allow fully automated re-training and evaluation for a specific collection of parser outputs. Nine teams participated in EPE 2017, submitting 49 distinct runs that encompass many different families of dependency representations, distinct approaches to preprocessing and parsing, and various types and volumes of training data.},
  booktitle = {Proceedings of the 2017 Shared Task on Extrinsic Parser Evaluation at the Fourth International Conference on Dependency Linguistics and the 15th International Conference on Parsing Technologies},
  author = {Oepen, Stephan and Øvrelid, Lilja and Björne, Jari and Johansson, Richard and Lapponi, Emanuele and Ginter, Filip and Velldal, Erik},
  year = {2017},
  publisher = {Association for Computational Linguistics (ACL)},
  address = {Stroudsburg, USA},
  ISBN = {978-1-945626-74-6},
}

@inProceedings{johansson-2017-2017-264160,
  title = {EPE 2017: The Trento–Gothenburg Opinion Extraction System},
  abstract = {We give an overview of one of the three downstream systems in the Extrinsic Parser Evaluation shared task of 2017: the Trento–Gothenburg system for opinion extraction. We describe the modifications required to make the system agnostic to its input dependency representation, and discuss how the input affects the various submodules of the system.
The results of the EPE shared task are presented and discussed, and to get a more detailed understanding of the effects of the dependencies we run two of the submodules separately. The results suggest that the module where the effects are strongest is the opinion holder extraction module, which can be explained by the fact that this module uses several dependency-based features. For the other modules, the effects are hard to measure.},
  booktitle = {Proceedings of the 2017 Shared Task on Extrinsic Parser Evaluation at the Fourth International Conference on Dependency Linguistics and the 15th International Conference on Parsing Technologies},
  author = {Johansson, Richard},
  year = {2017},
  publisher = {Association for Computational Linguistics (ACL)},
  address = {Stroudsburg, USA},
  ISBN = {978-1-945626-74-6},
}

@inProceedings{nietopina-johansson-2017-training-261938,
  title = {Training Word Sense Embeddings With Lexicon-based Regularization},
  abstract = {We propose to improve word sense embeddings by enriching an automatic corpus-based method with lexicographic data. Information from a lexicon is introduced into the learning algorithm’s objective function through a regularizer. The incorporation of lexicographic data yields embeddings that are able to reflect expert-defined word senses, while retaining the robustness, high quality, and coverage of automatic corpus-based methods. These properties are observed in a manual inspection of the semantic clusters that different degrees of regularizer strength create in the vector space. Moreover, we evaluate the sense embeddings in two downstream applications: word sense disambiguation and semantic frame prediction, where they outperform simpler approaches.
Our results show that a corpus-based model balanced with lexicographic data learns better representations and improve their performance in downstream tasks.},
  booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers), Taipei, Taiwan, November 27 – December 1, 2017},
  author = {Nieto Piña, Luis and Johansson, Richard},
  year = {2017},
  publisher = {Asian Federation of Natural Language Processing},
  ISBN = {978-1-948087-00-1},
}

@inProceedings{adouane-etal-2017-romanized-252493,
  title = {Romanized Arabic and Berber Detection Using PPM and Dictionary Methods},
  abstract = {Arabic is one of the Semitic languages written in Arabic script in its standard form. However, the recent rise of social media and new technologies has contributed considerably to the emergence of a new form of Arabic, namely Arabic written in Latin scripts, often called Romanized Arabic or Arabizi. While Romanized Arabic is an informal language, Berber or Tamazight uses Latin script in its standard form with some orthography differences depending on the country it is used in. Both these languages are under-resourced and unknown to the state-of-the-art language identifiers. In this paper, we present a language automatic identifier for both Romanized Arabic and Romanized Berber. We also describe the built linguistic resources (large dataset and lexicons) including a wide range of Arabic dialects (Algerian, Egyptian, Gulf, Iraqi, Levantine, Moroccan and Tunisian dialects) as well as the most popular Berber varieties (Kabyle, Tashelhit, Tarifit, Tachawit and Tamzabit). We use the Prediction by Partial Matching (PPM) and dictionary-based methods.
The methods reach a macro-average F-Measure of 98.74% and 97.60% respectively.},
  booktitle = {13th ACS/IEEE International Conference on Computer Systems and Applications AICCSA 2016},
  author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard},
  year = {2017},
  address = {Morocco},
  ISBN = {978-1-5090-4320-0},
}

@inProceedings{adouane-etal-2016-romanized-255457,
  title = {Romanized Arabic and Berber Detection Using Prediction by Partial Matching and Dictionary Methods},
  abstract = {Arabic is one of the Semitic languages written in Arabic script in its standard form. However, the recent rise of social media and new technologies has contributed considerably to the emergence of a new form of Arabic, namely Arabic written in Latin scripts, often called Romanized Arabic or Arabizi. While Romanized Arabic is an informal language, Berber or Tamazight uses Latin script in its standard form with some orthography differences depending on the country it is used in. Both these languages are under-resourced and unknown to the state-of-the-art language identifiers. In this paper, we present a language automatic identifier for both Romanized Arabic and Romanized Berber. We also describe the built linguistic resources (large dataset and lexicons) including a wide range of Arabic dialects (Algerian, Egyptian, Gulf, Iraqi, Levantine, Moroccan and Tunisian dialects) as well as the most popular Berber varieties (Kabyle, Tashelhit, Tarifit, Tachawit and Tamzabit). We use the Prediction by Partial Matching (PPM) and dictionary-based methods. The methods reach a macro-average F-Measure of 98.74% and 97.60% respectively.},
  booktitle = {2016 IEEE/ACS 13th International Conference of Computer Systems and Applications (AICCSA)},
  author = {Adouane, Wafia and Semmar, Nasredine
and Johansson, Richard},
  year = {2016},
  ISBN = {978-1-5090-4320-0},
}

@article{dupplaw-etal-2014-information-195563,
  title = {Information extraction from multimedia web documents: an open-source platform and testbed},
  abstract = {The LivingKnowledge project aimed to enhance the current state of the art in search, retrieval and knowledge management on the web by advancing the use of sentiment and opinion analysis within multimedia applications. To achieve this aim, a diverse set of novel and complementary analysis techniques have been integrated into a single, but extensible software platform on which such applications can be built. The platform combines state-of-the-art techniques for extracting facts, opinions and sentiment from multimedia documents, and unlike earlier platforms, it exploits both visual and textual techniques to support multimedia information retrieval. Foreseeing the usefulness of this software in the wider community, the platform has been made generally available as an open-source project. This paper describes the platform design, gives an overview of the analysis algorithms integrated into the system and describes two applications that utilise the system for multimedia information retrieval.},
  journal = {International Journal of Multimedia Information Retrieval},
  author = {Dupplaw, David and Matthews, Michael and Johansson, Richard and Boato, Giulia and Costanzo, Andrea and Fontani, Marco and Minack, Enrico and Demidova, Elena and Blanco, Roi and Griffiths, Thomas and Lewis, Paul and Hare, Jonathon and Moschitti, Alessandro},
  year = {2014},
  volume = {3},
  number = {2},
  pages = {97--111},
}

@inProceedings{adouane-etal-2016-arabicized-252492,
  title = {Arabicized and Romanized Berber Automatic Identification},
  abstract = {We present an automatic language identification tool for both Arabicized Berber (Berber written in the Arabic script) and Romanized Berber (Berber written in the Latin script). The focus is on short texts (social media content).
We use supervised machine learning method with character and word-based n-gram models as features. We also describe the corpora used in this paper. For both Arabicized and Romanized Berber, character-based 5-grams score the best giving an F-score of 99.50%.},
  booktitle = {Proceedings of TICAM 2016},
  author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard},
  year = {2016},
  publisher = {IRCAM},
  address = {Morocco},
}

@inProceedings{adouane-etal-2016-romanized-246849,
  title = {Romanized Berber and Romanized Arabic Automatic Language Identification Using Machine Learning},
  abstract = {The identification of the language of text/speech input is the first step to be able to properly do any language-dependent natural language processing. The task is called Automatic Language Identification (ALI). Being a well-studied field since early 1960’s, various methods have been applied to many standard languages. The ALI standard methods require datasets for training and use character/word-based n-gram models. However, social media and new technologies have contributed to the rise of informal and minority languages on the Web. The state-of-the-art automatic language identifiers fail to properly identify many of them. Romanized Arabic (RA) and Romanized Berber (RB) are cases of these informal languages which are under-resourced. The goal of this paper is twofold: detect RA and RB, at a document level, as separate languages and distinguish between them as they coexist in North Africa. We consider the task as a classification problem and use supervised machine learning to solve it.
For both languages, character-based 5-grams combined with additional lexicons score the best, F-score of 99.75% and 97.77% for RB and RA respectively.},
  booktitle = {Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects, December 12, 2016, Osaka, Japan},
  author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard},
  year = {2016},
  publisher = {Association for Computational Linguistics},
  pages = {53--61},
}

@inProceedings{nietopina-johansson-2016-benchmarking-251412,
  title = {Benchmarking Word Sense Disambiguation Systems for Swedish},
  abstract = {We compare several word sense disambiguation systems for Swedish and evaluate them on seven different sense-annotated corpora. Our results show that unsupervised systems beat a random baseline, but generally do not outperform a first-sense baseline considerably. On a lexical-sample dataset that allows us to train a supervised system, the unsupervised disambiguators are strongly outperformed by the supervised one.},
  booktitle = {The Sixth Swedish Language Technology Conference},
  author = {Nieto Piña, Luis and Johansson, Richard},
  year = {2016},
}

@inProceedings{adouane-etal-2016-asirem-246853,
  title = {ASIREM Participation at the Discriminating Similar Languages Shared Task 2016},
  booktitle = {Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects, December 12, 2016, Osaka, Japan},
  author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard},
  year = {2016},
  pages = {163--169},
}

@inProceedings{adouane-etal-2016-automatic-246765,
  title = {Automatic Detection of Arabicized Berber and Arabic Varieties},
  abstract = {Automatic Language Identification (ALI) is the detection of the natural language of an input text by a machine. It is the first necessary step to do any language-dependent natural language processing task.
Various methods have been successfully applied to a wide range of languages, and the state-of-the-art automatic language identifiers are mainly based on character n-gram models trained on huge corpora. However, there are many languages which are not yet automatically processed, for instance minority and informal languages. Many of these languages are only spoken and do not exist in a written format. Social media platforms and new technologies have facilitated the emergence of written format for these spoken languages based on pronunciation. The latter are not well represented on the Web, commonly referred to as under-resourced languages, and the current available ALI tools fail to properly recognize them. In this paper, we revisit the problem of ALI with the focus on Arabicized Berber and dialectal Arabic short texts. We introduce new resources and evaluate the existing methods. The results show that machine learning models combined with lexicons are well suited for detecting Arabicized Berber and different Arabic varieties and distinguishing between them, giving a macro-average F-score of 92.94%.},
  booktitle = {Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects, December 12, 2016, Osaka, Japan},
  author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard and Bobicev, Victoria},
  year = {2016},
  pages = {63--72},
}

@article{tahmasebi-etal-2015-visions-212969,
  title = {Visions and open challenges for a knowledge-based culturomics},
  abstract = {The concept of culturomics was born out of the availability of massive amounts of textual data and the interest to make sense of cultural and language phenomena over time. Thus far however, culturomics has only made use of, and shown the great potential of, statistical methods. In this paper, we present a vision for a knowledge-based culturomics that complements traditional culturomics.
We discuss the possibilities and challenges of combining knowledge-based methods with statistical methods and address major challenges that arise due to the nature of the data; diversity of sources, changes in language over time as well as temporal dynamics of information in general. We address all layers needed for knowledge-based culturomics, from natural language processing and relations to summaries and opinions.},
  journal = {International Journal on Digital Libraries},
  author = {Tahmasebi, Nina and Borin, Lars and Capannini, Gabriele and Dubhashi, Devdatt and Exner, Peter and Forsberg, Markus and Gossen, Gerhard and Johansson, Fredrik and Johansson, Richard and Kågebäck, Mikael and Mogren, Olof and Nugues, Pierre and Risse, Thomas},
  year = {2015},
  volume = {15},
  number = {2--4},
  pages = {169--187},
}

@inProceedings{borin-etal-2013-mining-188846,
  title = {Mining semantics for culturomics: towards a knowledge-based approach},
  abstract = {The massive amounts of text data made available through the Google Books digitization project have inspired a new field of big-data textual research. Named culturomics, this field has attracted the attention of a growing number of scholars over recent years. However, initial studies based on these data have been criticized for not referring to relevant work in linguistics and language technology. This paper provides some ideas, thoughts and first steps towards a new culturomics initiative, based this time on Swedish data, which pursues a more knowledge-based approach than previous work in this emerging field. The amount of new Swedish text produced daily and older texts being digitized in cultural heritage projects grows at an accelerating rate. These volumes of text being available in digital form have grown far beyond the capacity of human readers, leaving automated semantic processing of the texts as the only realistic option for accessing and using the information contained in them.
The aim of our recently initiated research program is to advance the state of the art in language technology resources and methods for semantic processing of Big Swedish text and focus on the theoretical and methodological advancement of the state of the art in extracting and correlating information from large volumes of Swedish text using a combination of knowledge-based and statistical methods.},
  booktitle = {2013 ACM International Workshop on Mining Unstructured Big Data Using Natural Language Processing, UnstructureNLP 2013, Held at 22nd ACM International Conference on Information and Knowledge Management, CIKM 2013; San Francisco, CA; United States; 28 October 2013},
  author = {Borin, Lars and Dubhashi, Devdatt and Forsberg, Markus and Johansson, Richard and Kokkinakis, Dimitrios and Nugues, Pierre},
  year = {2013},
  ISBN = {978-1-4503-2415-1},
  pages = {3--10},
}

@article{forsberg-etal-2014-from-208123,
  title = {From construction candidates to constructicon entries: An experiment using semi-automatic methods for identifying constructions in corpora},
  abstract = {We present an experiment where natural language processing tools are used to automatically identify potential constructions in a corpus. The experiment was conducted as part of the ongoing efforts to develop a Swedish constructicon. Using an automatic method to suggest constructions has advantages not only for efficiency but also methodologically: it forces the analyst to look more objectively at the constructions actually occurring in corpora, as opposed to focusing on “interesting” constructions only.
As a heuristic for identifying potential constructions, the method has proved successful, yielding about 200 (out of 1,200) highly relevant construction candidates.},
  journal = {Constructions and Frames},
  author = {Forsberg, Markus and Johansson, Richard and Bäckström, Linnéa and Borin, Lars and Lyngfelt, Benjamin and Olofsson, Joel and Prentice, Julia},
  year = {2014},
  volume = {6},
  number = {1},
  pages = {114--135},
}

@inProceedings{ghanimifard-johansson-2015-enriching-222749,
  title = {Enriching Word-sense Embeddings with Translational Context},
  abstract = {Vector-space models derived from corpora are an effective way to learn a representation of word meaning directly from data, and these models have many uses in practical applications. A number of unsupervised approaches have been proposed to automatically learn representations of word senses directly from corpora, but since these methods use no information but the words themselves, they sometimes miss distinctions that could be possible to make if more information were available. In this paper, we present a general framework that we call context enrichment that incorporates external information during the training of multi-sense vector-space models. Our approach is agnostic as to which external signal is used to enrich the context, but in this work we consider the use of translations as the source of enrichment. We evaluated the models trained using the translation-enriched context using several similarity benchmarks and a word analogy test set. In all our evaluations, the enriched model outperformed the purely word-based baseline soundly.},
  booktitle = {Proceedings of Recent Advances in Natural Language Processing, International Conference, Hissar, Bulgaria, 7–9 September 2015},
  editor = {Angelova, Galia and Bontcheva, Kalina and Mitkov, Ruslan},
  author = {Ghanimifard, Mehdi and Johansson, Richard},
  year = {2015},
  pages = {208--215},
}

@inProceedings{adesam-etal-2015-multiwords-228833,
  title = {Multiwords, Word Senses and Multiword Senses in the Eukalyptus Treebank of Written Swedish},
  abstract = {Multiwords reside at the intersection of the lexicon and syntax and in an annotation project, they will affect both levels. In the Eukalyptus treebank of written Swedish, we treat multiwords formally as syntactic objects, which are assigned a lexical type and sense. With the help of a simple dichotomy, analyzed vs unanalyzed multiwords, and the expressiveness of the syntactic annotation formalism employed, we are able to flexibly handle most multiword types and usages.},
  booktitle = {Proceedings of the Fourteenth International Workshop on Treebanks and Linguistic Theories (TLT14), 11–12 December 2015, Warsaw, Poland},
  author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard},
  year = {2015},
  ISBN = {978-83-63159-18-4},
  pages = {3--12},
}

@inProceedings{nietopina-johansson-2015-simple-222611,
  title = {A Simple and Efficient Method to Generate Word Sense Representations},
  abstract = {Distributed representations of words have boosted the performance of many Natural Language Processing tasks. However, usually only one representation per word is obtained, not acknowledging the fact that some words have multiple meanings. This has a negative effect on the individual word representations and the language model as a whole. In this paper we present a simple model that enables recent techniques for building word vectors to represent distinct senses of polysemic words.
In our assessment of this model we show that it is able to effectively discriminate between words’ senses and to do so in a computationally efficient manner.},
  booktitle = {Proceedings of International Conference in Recent Advances in Natural Language Processing, Hissar, Bulgaria, 7–9 September 2015},
  editor = {Angelova, Galia and Bontcheva, Kalina and Mitkov, Ruslan},
  author = {Nieto Piña, Luis and Johansson, Richard},
  year = {2015},
  pages = {465--472},
}

@inProceedings{adesam-etal-2015-defining-217815,
  title = {Defining the Eukalyptus forest – the Koala treebank of Swedish},
  abstract = {This paper details the design of the lexical and syntactic layers of a new annotated corpus of Swedish contemporary texts. In order to make the corpus adaptable into a variety of representations, the annotation is of a hybrid type with head-marked constituents and function-labeled edges, and with a rich annotation of non-local dependencies. The source material has been taken from public sources, to allow the resulting corpus to be made freely available.},
  booktitle = {Proceedings of the 20th Nordic Conference of Computational Linguistics, NODALIDA 2015, May 11-13, 2015, Vilnius, Lithuania},
  editor = {Megyesi, Beáta},
  author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard},
  year = {2015},
  ISBN = {978-91-7519-098-3},
  pages = {1--9},
}

@inProceedings{johansson-nietopina-2015-embedding-217863,
  title = {Embedding a Semantic Network in a Word Space},
  abstract = {We present a framework for using continuous-space vector representations of word meaning to derive new vectors representing the meaning of senses listed in a semantic network. It is a post-processing approach that can be applied to several types of word vector representations.
It uses two ideas: first, that vectors for polysemous words can be decomposed into a convex combination of sense vectors; secondly, that the vector for a sense is kept similar to those of its neighbors in the network. This leads to a constrained optimization problem, and we present an approximation for the case when the distance function is the squared Euclidean. We applied this algorithm on a Swedish semantic network, and we evaluate the quality of the resulting sense representations extrinsically by showing that they give large improvements when used in a classifier that creates lexical units for FrameNet frames. },
  booktitle = {Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. Denver, United States, May 31 – June 5, 2015},
  author = {Johansson, Richard and Nieto Piña, Luis},
  year = {2015},
  ISBN = {978-1-941643-49-5},
  pages = {1428--1433},
}

@inProceedings{kageback-etal-2015-neural-217864,
  title = {Neural context embeddings for automatic discovery of word senses},
  abstract = {Word sense induction (WSI) is the problem of automatically building an inventory of senses for a set of target words using only a text corpus. We introduce a new method for embedding word instances and their context, for use in WSI. The method, Instance-context embedding (ICE), leverages neural word embeddings, and the correlation statistics they capture, to compute high quality embeddings of word contexts. In WSI, these context embeddings are clustered to find the word senses present in the text. ICE is based on a novel method for combining word embeddings using continuous Skip-gram, based on both semantic and a temporal aspects of context words. ICE is evaluated both in a new system, and in an extension to a previous system for WSI. In both cases, we surpass previous state-of-the-art, on the WSI task of SemEval-2013, which highlights the generality of ICE.
Our proposed system achieves a 33% relative improvement.},
  booktitle = {Proceedings of the 1st Workshop on Vector Space Modeling for Natural Language Processing. Denver, United States},
  author = {Kågebäck, Mikael and Johansson, Fredrik and Johansson, Richard and Dubhashi, Devdatt},
  year = {2015},
  pages = {25--32},
}

@inProceedings{borin-etal-2015-here-217351,
  title = {Here be dragons? The perils and promises of inter-resource lexical-semantic mapping},
  abstract = {Lexical-semantic knowledges sources are a stock item in the language technologist’s toolbox, having proved their practical worth in many and diverse natural language processing (NLP) applications. In linguistics, lexical semantics comes in many flavors, but in the NLP world, wordnets reign more or less supreme. There has been some promising work utilizing Roget-style thesauruses instead, but wider experimentation is hampered by the limited availability of such resources. The work presented here is a first step in the direction of creating a freely available Roget-style lexical resource for modern Swedish. Here, we explore methods for automatic disambiguation of interresource mappings with the longer-term goal of utilizing similar techniques for automatic enrichment of lexical-semantic resources.},
  booktitle = {Linköping Electronic Conference Proceedings. Semantic resources and semantic annotation for Natural Language Processing and the Digital Humanities. Workshop at NODALIDA , May 11, 13-18 2015, Vilnius},
  author = {Borin, Lars and Nieto Piña, Luis and Johansson, Richard},
  year = {2015},
  volume = {112},
  ISBN = {978-91-7519-049-5},
  pages = {1--11},
}

@inProceedings{johansson-nietopina-2015-combining-216865,
  title = {Combining Relational and Distributional Knowledge for Word Sense Disambiguation},
  abstract = {We present a new approach to word sense disambiguation derived from recent ideas in distributional semantics.
The input to the algorithm is a large unlabeled corpus and a graph describing how senses are related; no sense-annotated corpus is needed. The fundamental idea is to embed meaning representations of senses in the same continuous-valued vector space as the representations of words. In this way, the knowledge encoded in the lexical resource is combined with the information derived by the distributional methods. Once this step has been carried out, the sense representations can be plugged back into e.g. the skip-gram model, which allows us to compute scores for the different possible senses of a word in a given context. We evaluated the new word sense disambiguation system on two Swedish test sets annotated with senses defined by the SALDO lexical resource. In both evaluations, our system soundly outperformed random and first-sense baselines. Its accuracy was slightly above that of a well-known graph-based system, while being computationally much more efficient.},
  booktitle = {Proceedings of the 20th Nordic Conference of Computational Linguistics, May 12-13, Vilnius, Lithuania. Linköping Electronic Conference Proceedings 109, Linköping University Electronic Press},
  author = {Johansson, Richard and Nieto Piña, Luis},
  year = {2015},
  ISBN = {978-91-7519-098-3},
  pages = {69--78},
}

@inProceedings{pilan-etal-2014-rule-210940,
  title = {Rule-based and machine learning approaches for second language sentence-level readability},
  abstract = {We present approaches for the identification of sentences understandable by second language learners of Swedish, which can be used in automatically generated exercises based on corpora. In this work we merged methods and knowledge from machine learning-based readability research, from rule-based studies of Good Dictionary Examples and from second language learning syllabuses. The proposed selection methods have also been implemented as a module in a free web-based language learning platform.
Users can use different parameters and linguistic filters to personalize their sentence search with or without a machine learning component assessing readability. The sentences selected have already found practical use as multiple-choice exercise items within the same platform. Out of a number of deep linguistic indicators explored, we found mainly lexical-morphological and semantic features informative for second language sentence-level readability. We obtained a readability classification accuracy result of 71%, which approaches the performance of other models used in similar tasks. Furthermore, during an empirical evaluation with teachers and students, about seven out of ten sentences selected were considered understandable, the rule-based approach slightly outperforming the method incorporating the machine learning model.},
  booktitle = {Proceedings of the Ninth Workshop on Innovative Use of NLP for Building Educational Applications, June 26, 2014 Baltimore, Maryland, USA},
  author = {Pilán, Ildikó and Volodina, Elena and Johansson, Richard},
  year = {2014},
  ISBN = {978-1-941643-03-7},
  pages = {174--184},
}

@inProceedings{ahlberg-etal-2014-swedish-210083,
  title = {Swedish FrameNet++ The Beginning of the End and the End of the Beginning},
  booktitle = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014},
  author = {Ahlberg, Malin and Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Friberg Heppin, Karin and Johansson, Richard and Kokkinakis, Dimitrios and Olsson, Leif-Jöran and Uppström, Jonatan},
  year = {2014},
}

@inProceedings{adesam-etal-2014-koala-211376,
  title = {Koala – Korp’s Linguistic Annotations Developing an infrastructure for text-based research with high-quality annotations},
  booktitle = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014},
  author = {Adesam, Yvonne and Borin, Lars and Bouma, Gerlof and Forsberg, Markus and
Johansson, Richard},
  year = {2014},
}

@inProceedings{gunther-etal-2014-rtrgo-201512,
  title = {RTRGO: Enhancing the GU-MLT-LT System for Sentiment Analysis of Short Messages},
  abstract = {This paper describes the enhancements made to our GU-MLT-LT system (Günther and Furrer, 2013) for the SemEval-2014 re-run of the SemEval-2013 shared task on sentiment analysis in Twitter. The changes include the usage of a Twitter-specific tokenizer, additional features and sentiment lexica, feature weighting and random subspace learning. The improvements result in an increase of 4.18 F-measure points on this year’s Twitter test set, ranking 3rd. },
  booktitle = {Proceedings of the 8th International Workshop on Semantic Evaluation (SemEval 2014) August 23-24, 2014 Dublin, Ireland},
  author = {Günther, Tobias and Vancoppenolle, Jean and Johansson, Richard},
  year = {2014},
  ISBN = {978-1-941643-24-2},
  pages = {497--502},
}

@article{johansson-2014-automatic-201874,
  title = {Automatic Expansion of the Swedish FrameNet Lexicon},
  abstract = {We evaluate several lexicon-based and corpus-based methods to automatically induce new lexical units for the Swedish FrameNet, and we see that the best-performing setup uses a combination of both types of methods. A particular challenge for Swedish is the absence of a lexical resource such as WordNet; however, we show that the semantic network SALDO, which is organized according to lexicographical principles quite different from those of WordNet, is very useful for our purposes.},
  journal = {Constructions and Frames},
  author = {Johansson, Richard},
  year = {2014},
  volume = {6},
  number = {1},
  pages = {92--113},
}

@inProceedings{pilan-etal-2013-automatic-188465,
  title = {Automatic Selection of Suitable Sentences for Language Learning Exercises},
  abstract = {In this study we investigated second and foreign language (L2) sentence readability, an area little explored so far in the case of several languages, including Swedish. The outcome of our research consists of two methods for sentence selection from native language corpora based on Natural Language Processing (NLP) and machine learning (ML) techniques. The two approaches have been made available online within Lärka, an Intelligent CALL (ICALL) platform offering activities for language learners and students of linguistics. Such an automatic selection of suitable sentences can be valuable for L2 teachers during the creation of new teaching materials, for L2 students who look for additional self-study exercises as well as for lexicographers in search of example sentences to illustrate the meaning of a vocabulary item. Members from all these potential user groups evaluated our methods and found the majority of the sentences selected suitable for L2 learning purposes.},
  booktitle = {20 Years of EUROCALL: Learning from the Past, Looking to the Future. 2013 EUROCALL Conference, 11th to 14th September 2013 Évora, Portugal, Proceedings.},
  author = {Pilán, Ildikó and Volodina, Elena and Johansson, Richard},
  year = {2013},
  ISBN = {978-1-908416-12-4},
  pages = {218--225},
}

@inProceedings{ghosh-etal-2013-mining-188844,
  title = {Mining Fine-grained Opinion Expressions with Shallow Parsing},
  abstract = {Opinion analysis deals with public opinions and trends, but subjective language is highly ambiguous. In this paper, we follow a simple data-driven technique to learn fine-grained opinions. We select an intersection set of Wall Street Journal documents that is included both in the Penn Discourse Tree Bank (PDTB) and in the Multi-Perspective Question Answering (MPQA) corpus. This is done in order to explore the usefulness of discourse-level structure to facilitate the extraction of fine-grained opinion expressions. Here we perform shallow parsing of MPQA expressions with connective based discourse structure, and then also with Named Entities (NE) and some syntax features using conditional random fields; the latter feature set is basically a collection of NEs and a bundle of features that is proved to be useful in a shallow discourse parsing task. We found that both of the feature-sets are useful to improve our baseline at different levels of this fine-grained opinion expression mining task.},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing},
  author = {Ghosh, Sucheta and Tonelli, Sara and Johansson, Richard},
  year = {2013},
  pages = {302--310},
}

@article{borin-johansson-2014-kulturomik-192931,
  title = {Kulturomik: Att spana efter språkliga och kulturella förändringar i digitala textarkiv},
  journal = {Historia i en digital värld},
  author = {Borin, Lars and Johansson, Richard},
  year = {2014},
}

@article{johansson-moschitti-2013-relational-158811,
  title = {Relational Features in Fine-grained Opinion Analysis},
  abstract = {Fine-grained opinion analysis often makes use of linguistic features but typically does not take the interaction between opinions into account. This article describes a set of experiments that demonstrate that relational features, mainly derived from dependency-syntactic and semantic role structures, can significantly improve the performance of automatic systems for a number of fine-grained opinion analysis tasks: marking up opinion expressions, finding opinion holders, and determining the polarities of opinion expressions. These features make it possible to model the way opinions expressed in natural-language discourse interact in a sentence over arbitrary distances. The use of relations requires us to consider multiple opinions simultaneously, which makes exact inference intractable. However, a reranker can be used as a sufficiently accurate and efficient approximation.
A number of feature sets and machine learning approaches for the rerankers are evaluated. For the task of opinion expression extraction, the best model shows a 10-point absolute improvement in soft recall on the MPQA corpus over a conventional sequence labeler based on local contextual features, while precision decreases only slightly. Significant improvements are also seen for the extended tasks where holders and polarities are considered: 10 and 7 points in recall, respectively. In addition, the systems outperform previously published results for unlabeled (6 F-measure points) and polarity-labeled (10–15 points) opinion expression extraction. Finally, as an extrinsic evaluation, the extracted MPQA-style opinion expressions are used in practical opinion mining tasks. In all scenarios considered, the machine learning features derived from the opinion expressions lead to statistically significant improvement.},
  journal = {Computational Linguistics},
  author = {Johansson, Richard and Moschitti, Alessandro},
  year = {2013},
  volume = {39},
  number = {3},
  pages = {473--509},
}

@inProceedings{bennaceur-etal-2013-automatic-158812,
  title = {Automatic Service Categorisation through Machine Learning in Emergent Middleware},
  booktitle = {Lecture notes in computer sciences},
  author = {Bennaceur, Amel and Johansson, Richard and Moschitti, Alessandro and Sykes, Daniel and Issarny, Valérie},
  year = {2013},
  volume = {7542},
  pages = {133--149},
}

@inProceedings{johansson-2013-training-173587,
  title = {Training Parsers on Incompatible Treebanks},
  abstract = {We consider the problem of training a statistical parser in the situation when there are multiple treebanks available, and these treebanks are annotated according to different linguistic conventions. To address this problem, we present two simple adaptation methods: the first method is based on the idea of using a shared feature representation when parsing multiple treebanks, and the second method on guided parsing where the output of one parser provides features for a second one. To evaluate and analyze the adaptation methods, we train parsers on treebank pairs in four languages: German, Swedish, Italian, and English. We see significant improvements for all eight treebanks when training on the full training sets. However, the clearest benefits are seen when we consider smaller training sets. Our experiments were carried out with unlabeled dependency parsers, but the methods can easily be generalized to other feature-based parsers.},
  booktitle = {Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  author = {Johansson, Richard},
  year = {2013},
  pages = {127--137},
}

@inProceedings{ju-etal-2013-learning-166990,
  title = {Learning to Rank from Structures in Hierarchical Text Classification},
  abstract = {In this paper, we model learning to rank algorithms based on structural dependencies in hierarchical multi-label text categorization (TC). Our method uses the classification probability of the binary classifiers of a standard top-down approach to generate k-best hypotheses. The latter are generated according to their global probability while at the same time satisfy the structural constraints between father and children nodes. The rank is then refined using Support Vector Machines and tree kernels applied to a structural representation of hypotheses, i.e., a hierarchy tree in which the outcome of binary one-vs-all classifiers is directly marked in its nodes.
Our extensive experiments on the whole Reuters Corpus Volume 1 show that our models significantly improve over the state of the art in TC, thanks to the use of structural dependencies.},
  booktitle = {Advances in Information Retrieval; 35th European Conference on IR Research, ECIR 2013, Moscow, Russia, March 24-27, 2013},
  editor = {Serdyukov, P. and others},
  author = {Ju, Qi and Moschitti, Alessandro and Johansson, Richard},
  year = {2013},
  series = {Lecture Notes in Computer Science},
  volume = {7814},
  ISBN = {978-3-642-36972-8},
  pages = {183--194},
}

@inProceedings{volodina-etal-2012-semi-165961,
  title = {Semi-automatic selection of best corpus examples for Swedish: Initial algorithm evaluation},
  abstract = {The study presented here describes the results of the initial evaluation of two sorting approaches to automatic ranking of corpus examples for Swedish. Representatives from two potential target user groups have been asked to rate top three hits per approach for sixty search items from the point of view of the needs of their professional target groups, namely second/foreign language (L2) teachers and lexicographers. This evaluation has shown, on the one hand, which of the two approaches to example rating (called in the text below algorithms #1 and #2) performs better in terms of finding better examples for each target user group; and on the other hand, which features evaluators associate with good examples. It has also facilitated statistic analysis of the “good” versus “bad” examples with reference to the measurable features, such as sentence length, word length, lexical frequency profiles, PoS constitution, dependency structure, etc. with a potential to find out new reliable classifiers.},
  booktitle = {Proceedings of the SLTC 2012 workshop on NLP for CALL, Lund, 25th October, 2012.
},
  author = {Volodina, Elena and Johansson, Richard and Johansson Kokkinakis, Sofie},
  year = {2012},
  number = {080},
  pages = {59--70},
}

@inProceedings{bennaceur-etal-2012-machine-160393,
  title = {Machine Learning for Emergent Middleware},
  abstract = {Highly dynamic and heterogeneous distributed systems are challenging today's middleware technologies. Existing middleware paradigms are unable to deliver on their most central promise, which is offering interoperability. In this paper, we argue for the need to dynamically synthesise distributed system infrastructures according to the current operating environment, thereby generating ``Emergent Middleware'' to mediate interactions among heterogeneous networked systems that interact in an ad hoc way. The paper outlines the overall architecture of Enablers underlying Emergent Middleware, and in particular focuses on the key role of learning in supporting such a process, spanning statistical learning to infer the semantics of networked system functions and automata learning to extract the related behaviours of networked systems.},
  booktitle = {Proceedings of the Joint Workshop on Intelligent Methods for Software System Engineering (JIMSE)},
  author = {Bennaceur, Amel and Howar, Falk and Issarny, Valérie and Johansson, Richard and Moschitti, Alessandro and Spalazzese, Romina and Steffen, Bernhard and Sykes, Daniel},
  year = {2012},
  note = {Accepted},
}

@inProceedings{johansson-2012-bridging-163602,
  title = {Bridging the Gap between Two Different Swedish Treebanks},
  abstract = {We present two simple adaptation methods to train a dependency parser in the situation when there are multiple treebanks available, and these treebanks are annotated according to different linguistic conventions. To test the methods, we train parsers on the Talbanken and Syntag treebanks of Swedish.
The results show that the methods are effective for low-to-medium training set sizes.},
  booktitle = {Proceedings of the Fourth Swedish Language Technology Conference (SLTC)},
  author = {Johansson, Richard},
  year = {2012},
  note = {Accepted},
}

@inProceedings{moschitti-etal-2012-modeling-156401,
  title = {Modeling Topic Dependencies in Hierarchical Text Categorization},
  abstract = {In this paper, we encode topic dependencies in hierarchical multi-label Text Categorization (TC) by means of rerankers. We represent reranking hypotheses with several innovative kernels considering both the structure of the hierarchy and the probability of nodes. Additionally, to better investigate the role of category relationships, we consider two interesting cases: (i) traditional schemes in which node-fathers include all the documents of their child-categories; and (ii) more general schemes, in which children can include documents not belonging to their fathers. The extensive experimentation on Reuters Corpus Volume 1 shows that our rerankers inject effective structural semantic dependencies in multi-classifiers and significantly outperform the state of the art.},
  booktitle = {Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (ACL 2012); Jeju, Korea; July 8-14},
  author = {Moschitti, Alessandro and Ju, Qi and Johansson, Richard},
  year = {2012},
  pages = {759--767},
}

@inProceedings{borin-etal-2012-transferring-157213,
  title = {Transferring Frames: Utilization of Linked Lexical Resources},
  abstract = {In our experiment, we evaluate the transferability of frames from Swedish to Finnish in parallel corpora. We evaluate both the theoretical possibility of transferring frames and the possibility of performing it using available lexical resources. We add the frame information to an extract of the Swedish side of the Kotus and JRC-Acquis corpora using an automatic frame labeler and copy it to the Finnish side.
We focus on evaluating the results to get an estimation on how often the parallel sentences can be said to express the same frame. This sheds light to the questions: Are the same situations in the two languages expressed using different frames, i.e. are the frames transferable even in theory? How well can the frame information of running text be transferred from language to another? },
  booktitle = {Proceedings of the Workshop on Inducing Linguistic Structure Submission (WILS)},
  author = {Borin, Lars and Forsberg, Markus and Johansson, Richard and Muhonen, Kristiina and Purtonen, Tanja and Voionmaa, Kaarlo},
  year = {2012},
  pages = {8--15},
}

@inProceedings{ghosh-etal-2012-global-157440,
  title = {Global Features for Shallow Discourse Parsing},
  abstract = {A coherently related group of sentences may be referred to as a discourse. In this paper we address the problem of parsing coherence relations as defined in the Penn Discourse Tree Bank (PDTB). A good model for discourse structure analysis needs to account both for local dependencies at the token-level and for global dependencies and statistics. We present techniques on using inter-sentential or sentence-level (global), data-driven, non-grammatical features in the task of parsing discourse. The parser model follows up previous approach based on using token-level (local) features with conditional random fields for shallow discourse parsing, which is lacking in structural knowledge of discourse. The parser adopts a two-stage approach where first the local constraints are applied and then global constraints are used on a reduced weighted search space (n-best). In the latter stage we experiment with different rerankers trained on the first stage n-best parses, which are generated using lexico-syntactic local features. The two-stage parser yields significant improvements over the best performing model of discourse parser on the PDTB corpus.},
  booktitle = {Proceedings of the 13th Annual Meeting of the Special Interest Group on Discourse and Dialogue (SIGDIAL)},
  author = {Ghosh, Sucheta and Riccardi, Giuseppe and Johansson, Richard},
  year = {2012},
  pages = {150--159},
}

@inProceedings{borin-etal-2012-search-157338,
  title = {Search Result Diversification Methods to Assist Lexicographers},
  abstract = {We show how the lexicographic task of finding informative and diverse example sentences can be cast as a search result diversification problem, where an objective based on relevance and diversity is maximized. This problem has been studied intensively in the information retrieval community during recent years, and efficient algorithms have been devised. We finally show how the approach has been implemented in a lexicographic project, and describe the relevance and diversity functions used in that context. },
  booktitle = {Proceedings of the 6th Linguistic Annotation Workshop},
  author = {Borin, Lars and Forsberg, Markus and Friberg Heppin, Karin and Johansson, Richard and Kjellandsson, Annika},
  year = {2012},
  pages = {113--117},
}

@inProceedings{ghosh-etal-2012-improving-156399,
  title = {Improving the Recall of a Discourse Parser by Constraint-based Postprocessing},
  abstract = {We describe two constraint-based methods that can be used to improve the recall of a shallow discourse parser based on conditional random field chunking. These methods use a set of natural structural constraints as well as others that follow from the annotation guidelines of the Penn Discourse Treebank. We evaluated the resulting systems on the standard test set of the PDTB and achieved a rebalancing of precision and recall with improved F-measures across the board.
This was especially notable when we used evaluation metrics taking partial matches into account; for these measures, we achieved F-measure improvements of several points.},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12); Istanbul, Turkey; May 23-25},
  author = {Ghosh, Sucheta and Johansson, Richard and Riccardi, Giuseppe and Tonelli, Sara},
  year = {2012},
  ISBN = {978-2-9517408-7-7},
  pages = {2791--2794},
}

@inProceedings{johansson-etal-2012-semantic-156400,
  title = {Semantic Role Labeling with the Swedish FrameNet},
  abstract = {We present the first results on semantic role labeling using the Swedish FrameNet, which is a lexical resource currently in development. Several aspects of the task are investigated, including the selection of machine learning features, the effect of choice of syntactic parser, and the ability of the system to generalize to new frames and new genres. In addition, we evaluate two methods to make the role label classifier more robust: cross-frame generalization and cluster-based features. Although the small amount of training data limits the performance achievable at the moment, we reach promising results. In particular, the classifier that extracts the boundaries of arguments works well for new frames, which suggests that it already at this stage can be useful in a semi-automatic setting.},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12); Istanbul, Turkey; May 23-25},
  author = {Johansson, Richard and Friberg Heppin, Karin and Kokkinakis, Dimitrios},
  year = {2012},
  ISBN = {978-2-9517408-7-7},
  pages = {3697--3700},
}

@inProceedings{ju-etal-2011-towards-151361,
  title = {Towards Using Reranking in Hierarchical Classification},
  abstract = {We consider the use of reranking as a way to relax typical independence assumptions often made in hierarchical multilabel classification. Our reranker is based on (i) an algorithm that generates promising k-best classification hypotheses from the output of local binary classifiers that classify nodes of a target tree-shaped hierarchy; and (ii) a tree kernel-based reranker applied to the classification tree associated with the hypotheses above. We carried out a number of experiments with this model on the Reuters corpus: we firstly show the potential of our algorithm by computing the oracle classification accuracy. This demonstrates that there is a significant room for potential improvement of the hierarchical classifier. Then, we measured the accuracy achieved by the reranker, which shows a significant performance improvement over the baseline. },
  booktitle = {Proceedings of the Joint ECML/PKDD-PASCAL Workshop on Large-Scale Hierarchical Classification; September 5, 2011; Athens, Greece},
  author = {Ju, Qi and Johansson, Richard and Moschitti, Alessandro},
  year = {2011},
}

@inProceedings{ghosh-etal-2011-shallow-151356,
  title = {Shallow Discourse Parsing with Conditional Random Fields},
  abstract = {Parsing discourse is a challenging natural language processing task. In this paper we take a data driven approach to identify arguments of explicit discourse connectives. In contrast to previous work we do not make any assumptions on the span of arguments and consider parsing as a token-level sequence labeling task. We design the argument segmentation task as a cascade of decisions based on conditional random fields (CRFs). We train the CRFs on lexical, syntactic and semantic features extracted from the Penn Discourse Treebank and evaluate feature combinations on the commonly used test split. We show that the best combination of features includes syntactic and semantic features.
The comparative error analysis investigates the performance variability over connective types and argument positions.},
  booktitle = {Proceedings of 5th International Joint Conference on Natural Language Processing; Chiang Mai, Thailand; November 8-13, 2011},
  editor = {Wang, Haifeng and Yarowsky, David},
  author = {Ghosh, Sucheta and Johansson, Richard and Riccardi, Giuseppe and Tonelli, Sara},
  year = {2011},
  pages = {1071--1079},
}