% Bibliography entries, one per citation key.
% Citation keys are kept exactly as exported (including non-ASCII characters)
% so existing citations keep resolving; a UTF-8 aware backend is presumably
% required for them -- TODO confirm the toolchain.

% NOTE(review): the journal field below was missing (it is required for
% article entries) and was filled in by hand -- confirm against the publisher record.
@article{Sandberg-Linn2019-285614,
  author   = {Sandberg, Linn and Bjereld, Ulf and Bunyik, Karina and Forsberg, Markus and Johansson, Richard},
  title    = {Issue Salience on {Twitter} During {Swedish} Party Leaders’ Debates},
  journal  = {Nordicom Review},
  year     = {2019},
  volume   = {40},
  number   = {2},
  pages    = {49--61},
  abstract = {The objective of this study is to contribute knowledge about formation of political agendas on Twitter during mediated political events, using the party leaders’ debates in Sweden before the general election of 2014 as a case study. Our findings show that issues brought up during the debates were largely mirrored on Twitter, with one striking discrepancy. Contrary to our expectations, issues on the left-right policy dimension were more salient on Twitter than in the debates, whereas issues such as the environment, immigration and refugees, all tied to a liberal-authoritarian value axis, were less salient on Twitter.},
}

% NOTE(review): the journal field below was missing (it is required for
% article entries) and was filled in by hand -- confirm against the publisher record.
@article{Mogren-Olof2019-285612,
  author   = {Mogren, Olof and Johansson, Richard},
  title    = {Character-based Recurrent Neural Networks for Morphological Relational Reasoning},
  journal  = {Journal of Language Modelling},
  year     = {2019},
  volume   = {7},
  number   = {1},
  pages    = {93--124},
  abstract = {We present a model for predicting inflected word forms based on morphological analogies. Previous work includes rule-based algorithms that determine and copy affixes from one word to another, with limited support for varying inflectional patterns. In related tasks such as morphological reinflection, the algorithm is provided with an explicit enumeration of morphological features which may not be available in all cases. In contrast, our model is feature-free: instead of explicitly representing morphological features, the model is given a demo pair that implicitly specifies a morphological relation (such as write:writes specifying infinitive:present). Given this demo relation and a query word (e.g. watch), the model predicts the target word (e.g. watches). To address this task, we devise a character-based recurrent neural network architecture using three separate encoders and one decoder. Our experimental evaluation on five different languages shows that the exact form can be predicted with high accuracy, consistently beating the baseline methods. Particularly, for English the prediction accuracy is 95.60\%. The solution is not limited to copying affixes from the demo relation, but generalizes to words with varying inflectional patterns, and can abstract away from the orthographic level to the level of morphological forms.},
}

@inproceedings{Åkerström-Joakim2019-284338,
  author    = {Åkerström, Joakim and Daoud, Adel and Johansson, Richard},
  title     = {Natural Language Processing in Policy Evaluation: Extracting Policy Conditions from {IMF} Loan Agreements},
  booktitle = {Proceedings of the 22nd Nordic Conference on Computational Linguistics; September 30 – October 2; Turku, Finland},
  year      = {2019},
  publisher = {Linköping University Electronic Press},
  abstract  = {Social science researchers often use text as the raw data in investigations: for instance, when investigating the effects of IMF policies on the development of countries under IMF programs, researchers typically encode structured descriptions of the programs using a time-consuming manual effort. Making this process automatic may open up new opportunities in scaling up such investigations. As a first step towards automatizing this coding process, we describe an experiment where we apply a sentence classifier that automatically detects mentions of policy conditions in IMF loan agreements and divides them into different types. The results show that the classifier is generally able to detect the policy conditions, although some types are hard to distinguish.},
}

@inproceedings{NietoPiña-Luis2018-270261,
  author    = {Nieto Piña, Luis and Johansson, Richard},
  title     = {Automatically Linking Lexical Resources with Word Sense Embedding Models},
  booktitle = {The Third Workshop on Semantic Deep Learning (SemDeep-3), August 20th, 2018, Santa Fe, New Mexico, USA},
  editor    = {Espinosa Anke, Luis and Declerck, Thierry and Gromann, Dagmar},
  year      = {2018},
  isbn      = {978-1-948087-56-8},
  abstract  = {Automatically learnt word sense embeddings are developed as an attempt to refine the capabilities of coarse word embeddings. The word sense representations obtained this way are, however, sensitive to underlying corpora and parameterizations, and they might be difficult to relate to word senses as formally defined by linguists. We propose to tackle this problem by devising a mechanism to establish links between word sense embeddings and lexical resources created by experts. We evaluate the applicability of these links in a task to retrieve instances of Swedish word senses not present in the lexicon.},
}

@inproceedings{Adesam-Yvonne2018-273841,
  author    = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard},
  title     = {The {Koala} Part-of-Speech and Morphological Tagset for {Swedish}},
  booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018},
  year      = {2018},
}

@inproceedings{Adesam-Yvonne2018-273839,
  author    = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard and Borin, Lars and Forsberg, Markus},
  title     = {The {Eukalyptus} Treebank of Written {Swedish}},
  booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018},
  year      = {2018},
}

@inproceedings{Fares-Murhaf2018-272105,
  author    = {Fares, Murhaf and Oepen, Stephan and Øvrelid, Lilja and Björne, Jari and Johansson, Richard},
  title     = {The 2018 Shared Task on Extrinsic Parser Evaluation: On the Downstream Utility of {English} {Universal Dependency} Parsers},
  booktitle = {Proceedings of the {CoNLL} 2018 Shared Task: Multilingual Parsing from Raw Text to Universal Dependencies},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  abstract  = {We summarize empirical results and tentative conclusions from the Second Extrinsic Parser Evaluation Initiative (EPE 2018). We review the basic task setup, downstream applications involved, and end-to-end results for seventeen participating parsers. Based on both quantitative and qualitative analysis, we correlate intrinsic evaluation results at different layers of morph-syntactic analysis with observed downstream behavior.},
}

% NOTE(review): the entry below is left exactly as found because it continues
% on the following lines. It reuses the key Fares-Murhaf2018-272105 defined
% above; presumably it is another exact duplicate and can be dropped once the
% remainder has been checked -- TODO confirm.
@inProceedings{Fares-Murhaf2018-272105, title = {The 2018 Shared Task on Extrinsic Parser Evaluation: On the Downstream Utility of English Universal Dependency Parsers}, abstract = {We summarize empirical results and tentative conclusions from the Second Extrinsic Parser Evaluation Initiative (EPE 2018). We review the basic task setup, downstream applications involved, and end-to-end results for seventeen participating parsers. 
Based on both quantitative and qualitative analysis, we correlate intrinsic evaluation results at different layers of morph-syntactic analysis with observed downstream behavior.}, booktitle = {Proceedings of the CoNLL 2018 Shared Task: Multilingual Parsing from Raw Text to Universal Dependencies}, author = {Fares, Murhaf and Oepen, Stephan and Øvrelid, Lilja and Björne, Jari and Johansson, Richard}, year = {2018}, publisher = {Association for Computational Linguistics}, } @inProceedings{Adesam-Yvonne2018-273841, title = {The Koala Part-of-Speech and Morphological Tagset for Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November, 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273839, title = {The Eukalyptus Treebank of Written Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard and Borin, Lars and Forsberg, Markus}, year = {2018}, } @inProceedings{Fares-Murhaf2018-272105, title = {The 2018 Shared Task on Extrinsic Parser Evaluation: On the Downstream Utility of English Universal Dependency Parsers}, abstract = {We summarize empirical results and tentative conclusions from the Second Extrinsic Parser Evaluation Initiative (EPE 2018). We review the basic task setup, downstream applications involved, and end-to-end results for seventeen participating parsers. 
Based on both quantitative and qualitative analysis, we correlate intrinsic evaluation results at different layers of morph-syntactic analysis with observed downstream behavior.}, booktitle = {Proceedings of the CoNLL 2018 Shared Task: Multilingual Parsing from Raw Text to Universal Dependencies}, author = {Fares, Murhaf and Oepen, Stephan and Øvrelid, Lilja and Björne, Jari and Johansson, Richard}, year = {2018}, publisher = {Association for Computational Linguistics}, } @inProceedings{Adouane-Wafia2017-252493, title = {Romanized Arabic and Berber Detection Using PPM and Dictionary Methods}, abstract = {Arabic is one of the Semitic languages written in Arabic script in its standard form. However, the recent rise of social media and new technologies has contributed considerably to the emergence of a new form of Arabic, namely Arabic written in Latin scripts, often called Romanized Arabic or Arabizi. While Romanized Arabic is an informal language, Berber or Tamazight uses Latin script in its standard form with some orthography differences depending on the country it is used in. Both these languages are under-resourced and unknown to the state-of-the-art language identifiers. In this paper, we present a language automatic identifier for both Romanized Arabic and Romanized Berber. We also describe the built linguistic resources (large dataset and lexicons) including a wide range of Arabic dialects (Algerian, Egyptian, Gulf, Iraqi, Levantine, Moroccan and Tunisian dialects) as well as the most popular Berber varieties (Kabyle, Tashelhit, Tarifit, Tachawit and Tamzabit). We use the Prediction by Partial Matching (PPM) and dictionary-based methods. 
The methods reach a macro-average F-Measure of 98.74% and 97.60% respectively.},
  booktitle = {13th ACS/IEEE International Conference on Computer Systems and Applications AICCSA 2016},
  author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard},
  year = {2017},
  address = {Morocco},
  ISBN = {978-1-5090-4320-0},
}

@inProceedings{Mogren-Olof2017-256929,
  title = {Character-based Recurrent Neural Networks for Morphological Relational Reasoning},
  abstract = {We present a model for predicting word forms based on morphological relational reasoning with analogies. While previous work has explored tasks such as morphological inflection and reinflection, these models rely on an explicit enumeration of morphological features, which may not be available in all cases. To address the task of predicting a word form given a demo relation (a pair of word forms) and a query word, we devise a character-based recurrent neural network architecture using three separate encoders and a decoder. We also investigate a multiclass learning setup, where the prediction of the relation type label is used as an auxiliary task. Our results show that the exact form can be predicted for English with an accuracy of 94.7%. For Swedish, which has a more complex morphology with more inflectional patterns for nouns and verbs, the accuracy is 89.3%.
We also show that using the auxiliary task of learning the relation type speeds up convergence and improves the prediction accuracy for the word generation task.},
  booktitle = {Proceedings of the First Workshop on Subword and Character Level Models in NLP},
  author = {Mogren, Olof and Johansson, Richard},
  year = {2017},
  publisher = {Association for Computational Linguistics},
  address = {Stroudsburg, PA, United States},
}

@inProceedings{NietoPiña-Luis2017-261938,
  title = {Training Word Sense Embeddings With Lexicon-based Regularization},
  abstract = {We propose to improve word sense embeddings by enriching an automatic corpus-based method with lexicographic data. Information from a lexicon is introduced into the learning algorithm’s objective function through a regularizer. The incorporation of lexicographic data yields embeddings that are able to reflect expert-defined word senses, while retaining the robustness, high quality, and coverage of automatic corpus-based methods. These properties are observed in a manual inspection of the semantic clusters that different degrees of regularizer strength create in the vector space. Moreover, we evaluate the sense embeddings in two downstream applications: word sense disambiguation and semantic frame prediction, where they outperform simpler approaches. Our results show that a corpus-based model balanced with lexicographic data learns better representations and improves their performance in downstream tasks.},
  booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers), Taipei, Taiwan, November 27 – December 1, 2017},
  author = {Nieto Piña, Luis and Johansson, Richard},
  year = {2017},
  publisher = {Asian Federation of Natural Language Processing},
  ISBN = {978-1-948087-00-1},
}

@inProceedings{Oepen-Stephan2017-264156,
  title = {The 2017 Shared Task on Extrinsic Parser Evaluation.
Towards a Reusable Community Infrastructure}, abstract = {The 2017 Shared Task on Extrinsic Parser Evaluation (EPE 2017) seeks to provide better estimates of the relative utility of different types of dependency representations for a variety of downstream applications that depend centrally on the analysis of grammatical structure. EPE 2017 defines a generalized notion of lexicalized syntactico-semantic dependency representations and provides a common interchange format to three state-of-the-art downstream applications, viz. biomedical event extraction, negation resolution, and fine-grained opinion analysis. As a first step towards building a generic and extensible infrastructure for extrinsic parser evaluation, the downstream applications have been generalized to support a broad range of diverse dependency representations (including divergent sentence and token boundaries) and to allow fully automated re-training and evaluation for a specific collection of parser outputs.
Nine teams participated in EPE 2017, submitting 49 distinct runs that encompass many different families of dependency representations, distinct approaches to preprocessing and parsing, and various types and volumes of training data.},
  booktitle = {Proceedings of the 2017 Shared Task on Extrinsic Parser Evaluation at the Fourth International Conference on Dependency Linguistics and the 15th International Conference on Parsing Technologies},
  author = {Oepen, Stephan and Øvrelid, Lilja and Björne, Jari and Johansson, Richard and Lapponi, Emanuele and Ginter, Filip and Velldal, Erik},
  year = {2017},
  publisher = {Association for Computational Linguistics (ACL)},
  address = {Stroudsburg, USA},
  ISBN = {978-1-945626-74-6},
}

@inProceedings{Johansson-Richard2017-264160,
  title = {EPE 2017: The Trento–Gothenburg Opinion Extraction System},
  abstract = {We give an overview of one of the three downstream systems in the Extrinsic Parser Evaluation shared task of 2017: the Trento–Gothenburg system for opinion extraction. We describe the modifications required to make the system agnostic to its input dependency representation, and discuss how the input affects the various submodules of the system. The results of the EPE shared task are presented and discussed, and to get a more detailed understanding of the effects of the dependencies we run two of the submodules separately. The results suggest that the module where the effects are strongest is the opinion holder extraction module, which can be explained by the fact that this module uses several dependency-based features.
For the other modules, the effects are hard to measure.},
  booktitle = {Proceedings of the 2017 Shared Task on Extrinsic Parser Evaluation at the Fourth International Conference on Dependency Linguistics and the 15th International Conference on Parsing Technologies},
  author = {Johansson, Richard},
  year = {2017},
  publisher = {Association for Computational Linguistics (ACL)},
  address = {Stroudsburg, USA},
  ISBN = {978-1-945626-74-6},
}

@inProceedings{Johansson-Richard2016-233140,
  title = {A Multi-domain Corpus of Swedish Word Sense Annotation},
  abstract = {We describe the word sense annotation layer in Eukalyptus, a freely available five-domain corpus of contemporary Swedish with several annotation layers. The annotation uses the SALDO lexicon to define the sense inventory, and allows word sense annotation of compound segments and multiword units. We give an overview of the new annotation tool developed for this project, and finally present an analysis of the inter-annotator agreement between two annotators.},
  booktitle = {10th edition of the Language Resources and Evaluation Conference, 23-28 May 2016, Portorož (Slovenia)},
  author = {Johansson, Richard and Adesam, Yvonne and Bouma, Gerlof and Hedberg, Karin},
  year = {2016},
  publisher = {European Language Resources Association},
  ISBN = {978-2-9517408-9-1},
}

@inProceedings{NietoPiña-Luis2016-241139,
  title = {Embedding Senses for Efficient Graph-based Word Sense Disambiguation},
  abstract = {We propose a simple graph-based method for word sense disambiguation (WSD) where sense and context embeddings are constructed by applying the Skip-gram method to random walks over the sense graph. We used this method to build a WSD system for Swedish using the SALDO lexicon, and evaluated it on six different annotated test sets.
In all cases, our system was several orders of magnitude faster than a state-of-the-art PageRank-based system, while outperforming a random baseline soundly.},
  booktitle = {Proceedings of TextGraphs-10: the Workshop on Graph-based Methods for Natural Language Processing},
  author = {Nieto Piña, Luis and Johansson, Richard},
  year = {2016},
  publisher = {Association for Computational Linguistics},
}

@inProceedings{Adouane-Wafia2016-242243,
  title = {Gulf Arabic Resource Building for Sentiment Analysis},
  abstract = {This paper deals with building linguistic resources for Gulf Arabic, one of the Arabic variations, for sentiment analysis task using machine learning. To our knowledge, no previous works were done for Gulf Arabic sentiment analysis despite the fact that it is present in different online platforms. Hence, the first challenge is the absence of annotated data and sentiment lexicons. To fill this gap, we created these two main linguistic resources. Then we conducted different experiments: use Naive Bayes classifier without any lexicon; add a sentiment lexicon designed basically for MSA; use only the compiled Gulf Arabic sentiment lexicon and finally use both MSA and Gulf Arabic sentiment lexicons. The Gulf Arabic lexicon gives a good improvement of the classifier accuracy (90.54%) over a baseline that does not use the lexicon (82.81%), while the MSA lexicon causes the accuracy to drop to (76.83%). Moreover, mixing MSA and Gulf Arabic lexicons causes the accuracy to drop to (84.94%) compared to using only Gulf Arabic lexicon.
This indicates that it is useless to use MSA resources to deal with Gulf Arabic due to the considerable differences and conflicting structures between these two languages.},
  booktitle = {Proceedings of the Language Resources and Evaluation Conference (LREC), 23-28 May 2016, Portorož, Slovenia},
  author = {Adouane, Wafia and Johansson, Richard},
  year = {2016},
  publisher = {European Language Resources Association},
  ISBN = {978-2-9517408-9-1},
}

@inProceedings{Ehrlemark-Anna2016-242241,
  title = {Retrieving Occurrences of Grammatical Constructions},
  abstract = {Finding authentic examples of grammatical constructions is central in constructionist approaches to linguistics, language processing, and second language learning. In this paper, we address this problem as an information retrieval (IR) task. To facilitate research in this area, we built a benchmark collection by annotating the occurrences of six constructions in a Swedish corpus. Furthermore, we implemented a simple and flexible retrieval system for finding construction occurrences, in which the user specifies a ranking function using lexical-semantic similarities (lexicon-based or distributional). The system was evaluated using standard IR metrics on the new benchmark, and we saw that lexical-semantical rerankers improve significantly over a purely surface-oriented system, but must be carefully tailored for each individual construction.
},
  booktitle = {Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers, December 11–17; Osaka, Japan},
  author = {Ehrlemark, Anna and Johansson, Richard and Lyngfelt, Benjamin},
  year = {2016},
  ISBN = {978-4-87974-702-0},
}

@inProceedings{Adouane-Wafia2016-246853,
  title = {ASIREM Participation at the Discriminating Similar Languages Shared Task 2016},
  booktitle = {Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects, December 12, 2016, Osaka, Japan},
  author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard},
  year = {2016},
  pages = {163--169},
}

@inProceedings{Adouane-Wafia2016-246849,
  title = {Romanized Berber and Romanized Arabic Automatic Language Identification Using Machine Learning},
  abstract = {The identification of the language of text/speech input is the first step to be able to properly do any language-dependent natural language processing. The task is called Automatic Language Identification (ALI). Being a well-studied field since early 1960’s, various methods have been applied to many standard languages. The ALI standard methods require datasets for training and use character/word-based n-gram models. However, social media and new technologies have contributed to the rise of informal and minority languages on the Web. The state-of-the-art automatic language identifiers fail to properly identify many of them. Romanized Arabic (RA) and Romanized Berber (RB) are cases of these informal languages which are under-resourced. The goal of this paper is twofold: detect RA and RB, at a document level, as separate languages and distinguish between them as they coexist in North Africa. We consider the task as a classification problem and use supervised machine learning to solve it.
For both languages, character-based 5-grams combined with additional lexicons score the best, F-score of 99.75% and 97.77% for RB and RA respectively.},
  booktitle = {Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects, December 12, 2016, Osaka, Japan},
  author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard},
  year = {2016},
  publisher = {Association for Computational Linguistics},
  pages = {53--61},
}

@inProceedings{Adouane-Wafia2016-246765,
  title = {Automatic Detection of Arabicized Berber and Arabic Varieties},
  abstract = {Automatic Language Identification (ALI) is the detection of the natural language of an input text by a machine. It is the first necessary step to do any language-dependent natural language processing task. Various methods have been successfully applied to a wide range of languages, and the state-of-the-art automatic language identifiers are mainly based on character n-gram models trained on huge corpora. However, there are many languages which are not yet automatically processed, for instance minority and informal languages. Many of these languages are only spoken and do not exist in a written format. Social media platforms and new technologies have facilitated the emergence of written format for these spoken languages based on pronunciation. The latter are not well represented on the Web, commonly referred to as under-resourced languages, and the current available ALI tools fail to properly recognize them. In this paper, we revisit the problem of ALI with the focus on Arabicized Berber and dialectal Arabic short texts. We introduce new resources and evaluate the existing methods.
The results show that machine learning models combined with lexicons are well suited for detecting Arabicized Berber and different Arabic varieties and distinguishing between them, giving a macro-average F-score of 92.94%.},
  booktitle = {Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects, December 12, 2016, Osaka, Japan},
  author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard and Bobicev, Victoria},
  year = {2016},
  pages = {63--72},
}

@article{NietoPiña-Luis2016-251412,
  title = {Benchmarking Word Sense Disambiguation Systems for Swedish},
  abstract = {We compare several word sense disambiguation systems for Swedish and evaluate them on seven different sense-annotated corpora. Our results show that unsupervised systems beat a random baseline, but generally do not outperform a first-sense baseline considerably. On a lexical-sample dataset that allows us to train a supervised system, the unsupervised disambiguators are strongly outperformed by the supervised one.},
  author = {Nieto Piña, Luis and Johansson, Richard},
  year = {2016},
}

@inProceedings{Adouane-Wafia2016-252492,
  title = {Arabicized and Romanized Berber Automatic Identification},
  abstract = {We present an automatic language identification tool for both Arabicized Berber (Berber written in the Arabic script) and Romanized Berber (Berber written in the Latin script). The focus is on short texts (social media content). We use supervised machine learning method with character and word-based n-gram models as features. We also describe the corpora used in this paper.
For both Arabicized and Romanized Berber, character-based 5-grams score the best giving an F-score of 99.50%.},
  booktitle = {Proceedings of TICAM 2016},
  author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard},
  year = {2016},
  publisher = {IRCAM},
  address = {Morocco},
}

@inProceedings{Adouane-Wafia2016-255457,
  title = {Romanized Arabic and Berber Detection Using Prediction by Partial Matching and Dictionary Methods},
  abstract = {Arabic is one of the Semitic languages written in Arabic script in its standard form. However, the recent rise of social media and new technologies has contributed considerably to the emergence of a new form of Arabic, namely Arabic written in Latin scripts, often called Romanized Arabic or Arabizi. While Romanized Arabic is an informal language, Berber or Tamazight uses Latin script in its standard form with some orthography differences depending on the country it is used in. Both these languages are under-resourced and unknown to the state-of-the-art language identifiers. In this paper, we present a language automatic identifier for both Romanized Arabic and Romanized Berber. We also describe the built linguistic resources (large dataset and lexicons) including a wide range of Arabic dialects (Algerian, Egyptian, Gulf, Iraqi, Levantine, Moroccan and Tunisian dialects) as well as the most popular Berber varieties (Kabyle, Tashelhit, Tarifit, Tachawit and Tamzabit). We use the Prediction by Partial Matching (PPM) and dictionary-based methods. The methods reach a macro-average F-Measure of 98.74% and 97.60% respectively.},
  booktitle = {2016 IEEE/ACS 13th International Conference of Computer Systems and Applications (AICCSA)},
  author = {Adouane, Wafia and Semmar, Nasredine
and Johansson, Richard},
  year = {2016},
  ISBN = {978-1-5090-4320-0},
}

@article{Tahmasebi-Nina2015-212969,
  title = {Visions and open challenges for a knowledge-based culturomics},
  abstract = {The concept of culturomics was born out of the availability of massive amounts of textual data and the interest to make sense of cultural and language phenomena over time. Thus far however, culturomics has only made use of, and shown the great potential of, statistical methods. In this paper, we present a vision for a knowledge-based culturomics that complements traditional culturomics. We discuss the possibilities and challenges of combining knowledge-based methods with statistical methods and address major challenges that arise due to the nature of the data; diversity of sources, changes in language over time as well as temporal dynamics of information in general. We address all layers needed for knowledge-based culturomics, from natural language processing and relations to summaries and opinions.},
  author = {Tahmasebi, Nina and Borin, Lars and Capannini, Gabriele and Dubhashi, Devdatt and Exner, Peter and Forsberg, Markus and Gossen, Gerhard and Johansson, Fredrik and Johansson, Richard and Kågebäck, Mikael and Mogren, Olof and Nugues, Pierre and Risse, Thomas},
  journal = {International Journal on Digital Libraries},
  year = {2015},
  volume = {15},
  number = {2-4},
  pages = {169--187},
}

@inProceedings{Johansson-Richard2015-216865,
  title = {Combining Relational and Distributional Knowledge for Word Sense Disambiguation},
  abstract = {We present a new approach to word sense disambiguation derived from recent ideas in distributional semantics. The input to the algorithm is a large unlabeled corpus and a graph describing how senses are related; no sense-annotated corpus is needed. The fundamental idea is to embed meaning representations of senses in the same continuous-valued vector space as the representations of words.
In this way, the knowledge encoded in the lexical resource is combined with the information derived by the distributional methods. Once this step has been carried out, the sense representations can be plugged back into e.g. the skip-gram model, which allows us to compute scores for the different possible senses of a word in a given context. We evaluated the new word sense disambiguation system on two Swedish test sets annotated with senses defined by the SALDO lexical resource. In both evaluations, our system soundly outperformed random and first-sense baselines. Its accuracy was slightly above that of a well-known graph-based system, while being computationally much more efficient.},
  booktitle = {Proceedings of the 20th Nordic Conference of Computational Linguistics, May 12-13, Vilnius, Lithuania},
  series = {Linköping Electronic Conference Proceedings},
  number = {109},
  publisher = {Linköping University Electronic Press},
  author = {Johansson, Richard and Nieto Piña, Luis},
  year = {2015},
  ISBN = {978-91-7519-098-3},
  pages = {69--78},
}

@inProceedings{Borin-Lars2015-217351,
  title = {Here be dragons? The perils and promises of inter-resource lexical-semantic mapping},
  abstract = {Lexical-semantic knowledge sources are a stock item in the language technologist’s toolbox, having proved their practical worth in many and diverse natural language processing (NLP) applications. In linguistics, lexical semantics comes in many flavors, but in the NLP world, wordnets reign more or less supreme. There has been some promising work utilizing Roget-style thesauruses instead, but wider experimentation is hampered by the limited availability of such resources. The work presented here is a first step in the direction of creating a freely available Roget-style lexical resource for modern Swedish.
Here, we explore methods for automatic disambiguation of inter-resource mappings with the longer-term goal of utilizing similar techniques for automatic enrichment of lexical-semantic resources.},
  booktitle = {Linköping Electronic Conference Proceedings. Semantic resources and semantic annotation for Natural Language Processing and the Digital Humanities. Workshop at NODALIDA, May 11, 13-18 2015, Vilnius},
  author = {Borin, Lars and Nieto Piña, Luis and Johansson, Richard},
  year = {2015},
  volume = {112},
  ISBN = {978-91-7519-049-5},
  pages = {1--11},
}

@inProceedings{Adesam-Yvonne2015-217815,
  title = {Defining the Eukalyptus forest – the Koala treebank of Swedish},
  abstract = {This paper details the design of the lexical and syntactic layers of a new annotated corpus of Swedish contemporary texts. In order to make the corpus adaptable into a variety of representations, the annotation is of a hybrid type with head-marked constituents and function-labeled edges, and with a rich annotation of non-local dependencies. The source material has been taken from public sources, to allow the resulting corpus to be made freely available.},
  booktitle = {Proceedings of the 20th Nordic Conference of Computational Linguistics, NODALIDA 2015, May 11-13, 2015, Vilnius, Lithuania},
  editor = {Megyesi, Beáta},
  author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard},
  year = {2015},
  ISBN = {978-91-7519-098-3},
  pages = {1--9},
}

@inProceedings{Johansson-Richard2015-217863,
  title = {Embedding a Semantic Network in a Word Space},
  abstract = {We present a framework for using continuous-space vector representations of word meaning to derive new vectors representing the meaning of senses listed in a semantic network. It is a post-processing approach that can be applied to several types of word vector representations.
It uses two ideas: first, that vectors for polysemous words can be decomposed into a convex combination of sense vectors; secondly, that the vector for a sense is kept similar to those of its neighbors in the network. This leads to a constrained optimization problem, and we present an approximation for the case when the distance function is the squared Euclidean. We applied this algorithm on a Swedish semantic network, and we evaluate the quality of the resulting sense representations extrinsically by showing that they give large improvements when used in a classifier that creates lexical units for FrameNet frames.},
  booktitle = {Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. Denver, United States, May 31 – June 5, 2015},
  author = {Johansson, Richard and Nieto Piña, Luis},
  year = {2015},
  ISBN = {978-1-941643-49-5},
  pages = {1428--1433},
}

@inProceedings{Kågebäck-Mikael2015-217864,
  title = {Neural context embeddings for automatic discovery of word senses},
  abstract = {Word sense induction (WSI) is the problem of automatically building an inventory of senses for a set of target words using only a text corpus. We introduce a new method for embedding word instances and their context, for use in WSI. The method, Instance-context embedding (ICE), leverages neural word embeddings, and the correlation statistics they capture, to compute high quality embeddings of word contexts. In WSI, these context embeddings are clustered to find the word senses present in the text. ICE is based on a novel method for combining word embeddings using continuous Skip-gram, based on both semantic and a temporal aspects of context words. ICE is evaluated both in a new system, and in an extension to a previous system for WSI. In both cases, we surpass previous state-of-the-art, on the WSI task of SemEval-2013, which highlights the generality of ICE.
Our proposed system achieves a 33% relative improvement.},
  booktitle = {Proceedings of the 1st Workshop on Vector Space Modeling for Natural Language Processing. Denver, United States},
  author = {Kågebäck, Mikael and Johansson, Fredrik and Johansson, Richard and Dubhashi, Devdatt},
  year = {2015},
  pages = {25--32},
}

@inProceedings{NietoPiña-Luis2015-222611,
  title = {A Simple and Efficient Method to Generate Word Sense Representations},
  abstract = {Distributed representations of words have boosted the performance of many Natural Language Processing tasks. However, usually only one representation per word is obtained, not acknowledging the fact that some words have multiple meanings. This has a negative effect on the individual word representations and the language model as a whole. In this paper we present a simple model that enables recent techniques for building word vectors to represent distinct senses of polysemic words. In our assessment of this model we show that it is able to effectively discriminate between words’ senses and to do so in a computationally efficient manner.},
  booktitle = {Proceedings of International Conference in Recent Advances in Natural Language Processing, Hissar, Bulgaria 7–9 September, 2015},
  editor = {Angelova, Galia and Bontcheva, Kalina and Mitkov, Ruslan},
  author = {Nieto Piña, Luis and Johansson, Richard},
  year = {2015},
  pages = {465--472},
}

@inProceedings{Ghanimifard-Mehdi2015-222749,
  title = {Enriching Word-sense Embeddings with Translational Context},
  abstract = {Vector-space models derived from corpora are an effective way to learn a representation of word meaning directly from data, and these models have many uses in practical applications.
A number of unsupervised approaches have been proposed to automatically learn representations of word senses directly from corpora, but since these methods use no information but the words themselves, they sometimes miss distinctions that could be possible to make if more information were available. In this paper, we present a general framework that we call context enrichment that incorporates external information during the training of multi-sense vector-space models. Our approach is agnostic as to which external signal is used to enrich the context, but in this work we consider the use of translations as the source of enrichment. We evaluated the models trained using the translation-enriched context using several similarity benchmarks and a word analogy test set. In all our evaluations, the enriched model outperformed the purely word-based baseline soundly. }, booktitle = {Proceedings of Recent Advances in Natural Language Processing / edited by Galia Angelova, Kalina Bontcheva, Ruslan Mitkov. International Conference, Hissar, Bulgaria 7–9 September, 2015}, author = {Ghanimifard, Mehdi and Johansson, Richard}, year = {2015}, pages = {208--215}, } @inProceedings{Adesam-Yvonne2015-228833, title = {Multiwords, Word Senses and Multiword Senses in the Eukalyptus Treebank of Written Swedish}, abstract = {Multiwords reside at the intersection of the lexicon and syntax and in an annotation project, they will affect both levels. In the Eukalyptus treebank of written Swedish, we treat multiwords formally as syntactic objects, which are assigned a lexical type and sense. 
With the help of a simple dichotomy, analyzed vs unanalyzed multiwords, and the expressiveness of the syntactic annotation formalism employed, we are able to flexibly handle most multiword types and usages.}, booktitle = {Proceedings of the Fourteenth International Workshop on Treebanks and Linguistic Theories (TLT14), 11–12 December 2015 Warsaw, Poland}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard}, year = {2015}, ISBN = {978-83-63159-18-4}, pages = {3--12}, } @article{Borin-Lars2014-192931, title = {Kulturomik: Att spana efter språkliga och kulturella förändringar i digitala textarkiv}, author = {Borin, Lars and Johansson, Richard}, year = {2014}, } @article{Dupplaw-David2014-195563, title = {Information extraction from multimedia web documents: an open-source platform and testbed}, abstract = {The LivingKnowledge project aimed to enhance the current state of the art in search, retrieval and knowledge management on the web by advancing the use of sentiment and opinion analysis within multimedia applications. To achieve this aim, a diverse set of novel and complementary analysis techniques have been integrated into a single, but extensible software platform on which such applications can be built. The platform combines state-of-the-art techniques for extracting facts, opinions and sentiment from multimedia documents, and unlike earlier platforms, it exploits both visual and textual techniques to support multimedia information retrieval. Foreseeing the usefulness of this software in the wider community, the platform has been made generally available as an open-source project. 
This paper describes the platform design, gives an overview of the analysis algorithms integrated into the system and describes two applications that utilise the system for multimedia information retrieval.}, author = {Dupplaw, David and Matthews, Michael and Johansson, Richard and Boato, Giulia and Costanzo, Andrea and Fontani, Marco and Minack, Enrico and Demidova, Elena and Blanco, Roi and Griffiths, Thomas and Lewis, Paul and Hare, Jonathon and Moschitti, Alessandro}, year = {2014}, volume = {3}, number = {2}, pages = {97--111}, } @inProceedings{Pil?n-Ildik?2014-10, title = "Rule-based and machine learning approaches for second language sentence-level readability", booktitle = "Proceedings of the ACL 2014 9th Workshop on Innovative Use of NLP for Building Educational Applications, Baltimore, June 22-27 2014", author = "Pilán, Ildikó and Volodina, Elena and Johansson, Richard", year = "2014", isbn = "978-1-941643-03-7", url = "http://www.aclweb.org/anthology/W/W14/W14-1821.pdf", pages = "174--184", } @inProceedings{Günther-Tobias2014-201512, title = {RTRGO: Enhancing the GU-MLT-LT System for Sentiment Analysis of Short Messages}, abstract = {This paper describes the enhancements made to our GU-MLT-LT system (Günther and Furrer, 2013) for the SemEval-2014 re-run of the SemEval-2013 shared task on sentiment analysis in Twitter. The changes include the usage of a Twitter-specific tokenizer, additional features and sentiment lexica, feature weighting and random subspace learning. The improvements result in an increase of 4.18 F-measure points on this year’s Twitter test set, ranking 3rd. 
}, booktitle = {Proceedings of the 8th International Workshop on Semantic Evaluation (SemEval 2014) August 23-24, 2014 Dublin, Ireland}, author = {Günther, Tobias and Vancoppenolle, Jean and Johansson, Richard}, year = {2014}, ISBN = {978-1-941643-24-2}, pages = {497--502}, } @article{Johansson-Richard2014-201874, title = {Automatic Expansion of the Swedish FrameNet Lexicon}, abstract = {We evaluate several lexicon-based and corpus-based methods to automatically induce new lexical units for the Swedish FrameNet, and we see that the best-performing setup uses a combination of both types of methods. A particular challenge for Swedish is the absence of a lexical resource such as WordNet; however, we show that the semantic network SALDO, which is organized according to lexicographical principles quite different from those of WordNet, is very useful for our purposes.}, author = {Johansson, Richard}, year = {2014}, volume = {6}, number = {1}, pages = {92--113}, } @article{Forsberg-Markus2014-3, title = "From construction candidates to constructicon entries: An experiment using semi-automatic methods for identifying constructions in corpora", journal = "Constructions and Frames", author = "Forsberg, Markus and Johansson, Richard and Bäckström, Linnéa and Borin, Lars and Lyngfelt, Benjamin and Olofsson, Joel and Prentice, Julia", year = "2014", volume = "6", number = "1", url = "http://www.jbe-platform.com/content/journals/10.1075/cf.6.1.07for", pages = "114--135", } @article{Forsberg-Markus2014-208123, title = {From construction candidates to constructicon entries: An experiment using semi-automatic methods for identifying constructions in corpora}, abstract = { We present an experiment where natural language processing tools are used to automatically identify potential constructions in a corpus. The experiment was conducted as part of the ongoing efforts to develop a Swedish constructicon. 
Using an automatic method to suggest constructions has advantages not only for efficiency but also methodologically: it forces the analyst to look more objectively at the constructions actually occurring in corpora, as opposed to focusing on “interesting” constructions only. As a heuristic for identifying potential constructions, the method has proved successful, yielding about 200 (out of 1,200) highly relevant construction candidates.}, journal = {Constructions and Frames}, author = {Forsberg, Markus and Johansson, Richard and Bäckström, Linnéa and Borin, Lars and Lyngfelt, Benjamin and Olofsson, Joel and Prentice, Julia}, year = {2014}, volume = {6}, number = {1}, pages = {114--135}, } @inProceedings{Ahlberg-Malin2014-210083, title = {Swedish FrameNet++ The Beginning of the End and the End of the Beginning}, booktitle = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014}, author = {Ahlberg, Malin and Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Friberg Heppin, Karin and Johansson, Richard and Kokkinakis, Dimitrios and Olsson, Leif-Jöran and Uppström, Jonatan}, year = {2014}, } @inProceedings{Pilán-Ildikó2014-210940, title = {Rule-based and machine learning approaches for second language sentence-level readability}, abstract = {We present approaches for the identification of sentences understandable by second language learners of Swedish, which can be used in automatically generated exercises based on corpora. In this work we merged methods and knowledge from machine learning-based readability research, from rule-based studies of Good Dictionary Examples and from second language learning syllabuses. The proposed selection methods have also been implemented as a module in a free web-based language learning platform. Users can use different parameters and linguistic filters to personalize their sentence search with or without a machine learning component assessing readability. 
The sentences selected have already found practical use as multiple-choice exercise items within the same platform. Out of a number of deep linguistic indicators explored, we found mainly lexical-morphological and semantic features informative for second language sentence-level readability. We obtained a readability classification accuracy result of 71%, which approaches the performance of other models used in similar tasks. Furthermore, during an empirical evaluation with teachers and students, about seven out of ten sentences selected were considered understandable, the rule-based approach slightly outperforming the method incorporating the machine learning model.}, booktitle = {Proceedings of the Ninth Workshop on Innovative Use of NLP for Building Educational Applications, June 26, 2014 Baltimore, Maryland, USA}, author = {Pilán, Ildikó and Volodina, Elena and Johansson, Richard}, year = {2014}, ISBN = {978-1-941643-03-7}, pages = {174--184}, } @inProceedings{Adesam-Yvonne2014-211376, title = {Koala – Korp’s Linguistic Annotations Developing an infrastructure for text-based research with high-quality annotations}, booktitle = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014}, author = {Adesam, Yvonne and Borin, Lars and Bouma, Gerlof and Forsberg, Markus and Johansson, Richard}, year = {2014}, } @inProceedings{Bennaceur-Amel2013-158812, title = {Automatic Service Categorisation through Machine Learning in Emergent Middleware}, booktitle = {Lecture notes in computer sciences}, author = {Bennaceur, Amel and Johansson, Richard and Moschitti, Alessandro and Sykes, Daniel and Issarny, Valérie}, year = {2013}, volume = {7542}, pages = {133--149}, } @article{Johansson-Richard2013-158811, title = {Relational Features in Fine-grained Opinion Analysis}, abstract = {Fine-grained opinion analysis often makes use of linguistic features but typically does not take the interaction between opinions into account. 
This article describes a set of experiments that demonstrate that relational features, mainly derived from dependency-syntactic and semantic role structures, can significantly improve the performance of automatic systems for a number of fine-grained opinion analysis tasks: marking up opinion expressions, finding opinion holders, and determining the polarities of opinion expressions. These features make it possible to model the way opinions expressed in natural-language discourse interact in a sentence over arbitrary distances. The use of relations requires us to consider multiple opinions simultaneously, which makes exact inference intractable. However, a reranker can be used as a sufficiently accurate and efficient approximation. A number of feature sets and machine learning approaches for the rerankers are evaluated. For the task of opinion expression extraction, the best model shows a 10-point absolute improvement in soft recall on the MPQA corpus over a conventional sequence labeler based on local contextual features, while precision decreases only slightly. Significant improvements are also seen for the extended tasks where holders and polarities are considered: 10 and 7 points in recall, respectively. In addition, the systems outperform previously published results for unlabeled (6 F-measure points) and polarity-labeled (10–15 points) opinion expression extraction. Finally, as an extrinsic evaluation, the extracted MPQA-style opinion expressions are used in practical opinion mining tasks. 
In all scenarios considered, the machine learning features derived from the opinion expressions lead to statistically significant improvement.}, author = {Johansson, Richard and Moschitti, Alessandro}, year = {2013}, volume = {39}, number = {3}, pages = {473--509}, } @inProceedings{Ju-Qi2013-166990, title = {Learning to Rank from Structures in Hierarchical Text Classification}, abstract = {In this paper, we model learning to rank algorithms based on structural dependencies in hierarchical multi-label text categorization (TC). Our method uses the classification probability of the binary classifiers of a standard top-down approach to generate k-best hypotheses. The latter are generated according to their global probability while at the same time satisfy the structural constraints between father and children nodes. The rank is then refined using Support Vector Machines and tree kernels applied to a structural representation of hypotheses, i.e., a hierarchy tree in which the outcome of binary one-vs-all classifiers is directly marked in its nodes. Our extensive experiments on the whole Reuters Corpus Volume 1 show that our models significantly improve over the state of the art in TC, thanks to the use of structural dependencies.}, booktitle = {Advances in Information Retrieval; 35th European Conference on IR Research, ECIR 2013, Moscow, Russia, March 24-27, 2013; P. Serdyukov et al. (ed)}, author = {Ju, Qi and Moschitti, Alessandro and Johansson, Richard}, year = {2013}, series = {Lecture Notes in Computer Science}, volume = {7814}, ISBN = {978-3-642-36972-8}, pages = {183--194}, } @inProceedings{Johansson-Richard2013-173587, title = {Training Parsers on Incompatible Treebanks}, abstract = {We consider the problem of training a statistical parser in the situation when there are multiple treebanks available, and these treebanks are annotated according to different linguistic conventions. 
To address this problem, we present two simple adaptation methods: the first method is based on the idea of using a shared feature representation when parsing multiple treebanks, and the second method on guided parsing where the output of one parser provides features for a second one. To evaluate and analyze the adaptation methods, we train parsers on treebank pairs in four languages: German, Swedish, Italian, and English. We see significant improvements for all eight treebanks when training on the full training sets. However, the clearest benefits are seen when we consider smaller training sets. Our experiments were carried out with unlabeled dependency parsers, but the methods can easily be generalized to other feature-based parsers.}, booktitle = {Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, author = {Johansson, Richard}, year = {2013}, pages = {127--137}, } @inProceedings{Pilán-Ildikó2013-188465, title = {Automatic Selection of Suitable Sentences for Language Learning Exercises}, abstract = {In this study we investigated second and foreign language (L2) sentence readability, an area little explored so far in the case of several languages, including Swedish. The outcome of our research consists of two methods for sentence selection from native language corpora based on Natural Language Processing (NLP) and machine learning (ML) techniques. The two approaches have been made available online within Lärka, an Intelligent CALL (ICALL) platform offering activities for language learners and students of linguistics. Such an automatic selection of suitable sentences can be valuable for L2 teachers during the creation of new teaching materials, for L2 students who look for additional self-study exercises as well as for lexicographers in search of example sentences to illustrate the meaning of a vocabulary item. 
Members from all these potential user groups evaluated our methods and found the majority of the sentences selected suitable for L2 learning purposes.}, booktitle = {20 Years of EUROCALL: Learning from the Past, Looking to the Future. 2013 EUROCALL Conference, 11th to 14th September 2013 Évora, Portugal, Proceedings.}, author = {Pilán, Ildikó and Volodina, Elena and Johansson, Richard}, year = {2013}, ISBN = {978-1-908416-12-4}, pages = {218--225}, } @inProceedings{Ghosh-Sucheta2013-188844, title = {Mining Fine-grained Opinion Expressions with Shallow Parsing}, abstract = {Opinion analysis deals with public opinions and trends, but subjective language is highly ambiguous. In this paper, we follow a simple data-driven technique to learn fine-grained opinions. We select an intersection set of Wall Street Journal documents that is included both in the Penn Discourse Tree Bank (PDTB) and in the Multi-Perspective Question Answering (MPQA) corpus. This is done in order to explore the usefulness of discourse-level structure to facilitate the extraction of fine-grained opinion expressions. Here we perform shallow parsing of MPQA expressions with connective based discourse structure, and then also with Named Entities (NE) and some syntax features using conditional random fields; the latter feature set is basically a collection of NEs and a bundle of features that is proved to be useful in a shallow discourse parsing task. 
We found that both of the feature-sets are useful to improve our baseline at different levels of this fine-grained opinion expression mining task.}, booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing}, author = {Ghosh, Sucheta and Tonelli, Sara and Johansson, Richard}, year = {2013}, pages = {302--310}, } @inProceedings{Borin-Lars2013-188846, title = {Mining semantics for culturomics: towards a knowledge-based approach}, abstract = {The massive amounts of text data made available through the Google Books digitization project have inspired a new field of big-data textual research. Named culturomics, this field has attracted the attention of a growing number of scholars over recent years. However, initial studies based on these data have been criticized for not referring to relevant work in linguistics and language technology. This paper provides some ideas, thoughts and first steps towards a new culturomics initiative, based this time on Swedish data, which pursues a more knowledge-based approach than previous work in this emerging field. The amount of new Swedish text produced daily and older texts being digitized in cultural heritage projects grows at an accelerating rate. These volumes of text being available in digital form have grown far beyond the capacity of human readers, leaving automated semantic processing of the texts as the only realistic option for accessing and using the information contained in them. 
The aim of our recently initiated research program is to advance the state of the art in language technology resources and methods for semantic processing of Big Swedish text and focus on the theoretical and methodological advancement of the state of the art in extracting and correlating information from large volumes of Swedish text using a combination of knowledge-based and statistical methods.}, booktitle = {2013 ACM International Workshop on Mining Unstructured Big Data Using Natural Language Processing, UnstructureNLP 2013, Held at 22nd ACM International Conference on Information and Knowledge Management, CIKM 2013; San Francisco, CA; United States; 28 October 2013 through 28 October 2013}, author = {Borin, Lars and Dubhashi, Devdatt and Forsberg, Markus and Johansson, Richard and Kokkinakis, Dimitrios and Nugues, Pierre}, year = {2013}, ISBN = {978-1-4503-2415-1}, pages = {3--10}, } @inProceedings{Johansson-Richard2012-156400, title = {Semantic Role Labeling with the Swedish FrameNet}, abstract = {We present the first results on semantic role labeling using the Swedish FrameNet, which is a lexical resource currently in development. Several aspects of the task are investigated, including the selection of machine learning features, the effect of choice of syntactic parser, and the ability of the system to generalize to new frames and new genres. In addition, we evaluate two methods to make the role label classifier more robust: cross-frame generalization and cluster-based features. Although the small amount of training data limits the performance achievable at the moment, we reach promising results. 
In particular, the classifier that extracts the boundaries of arguments works well for new frames, which suggests that it already at this stage can be useful in a semi-automatic setting.}, booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12); Istanbul, Turkey; May 23-25}, author = {Johansson, Richard and Friberg Heppin, Karin and Kokkinakis, Dimitrios}, year = {2012}, ISBN = {978-2-9517408-7-7}, pages = {3697--3700}, } @inProceedings{Ghosh-Sucheta2012-156399, title = {Improving the Recall of a Discourse Parser by Constraint-based Postprocessing}, abstract = {We describe two constraint-based methods that can be used to improve the recall of a shallow discourse parser based on conditional random field chunking. These methods use a set of natural structural constraints as well as others that follow from the annotation guidelines of the Penn Discourse Treebank. We evaluated the resulting systems on the standard test set of the PDTB and achieved a rebalancing of precision and recall with improved F-measures across the board. This was especially notable when we used evaluation metrics taking partial matches into account; for these measures, we achieved F-measure improvements of several points.}, booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12); Istanbul, Turkey; May 23-25}, author = {Ghosh, Sucheta and Johansson, Richard and Riccardi, Giuseppe and Tonelli, Sara}, year = {2012}, ISBN = {978-2-9517408-7-7}, pages = {2791--2794}, } @inProceedings{Johansson-Richard2012-156993, title = {Non-atomic Classification to Improve a Semantic Role Labeler for a Low-resource Language}, abstract = {Semantic role classification accuracy for most languages other than English is constrained by the small amount of annotated data. 
In this paper, we demonstrate how the frame-to-frame relations described in the FrameNet ontology can be used to improve the performance of a FrameNet-based semantic role classifier for Swedish, a low-resource language. In order to make use of the FrameNet relations, we cast the semantic role classification task as a non-atomic label prediction task. The experiments show that the cross-frame generalization methods lead to a 27% reduction in the number of errors made by the classifier. For previously unseen frames, the reduction is even more significant: 50%. }, booktitle = {Proceedings of the First Joint Conference on Lexical and Computational Semantics (*SEM); June 7-8; Montréal, Canada}, author = {Johansson, Richard}, year = {2012}, pages = {95--99}, } @inProceedings{Moschitti-Alessandro2012-156401, title = {Modeling Topic Dependencies in Hierarchical Text Categorization}, abstract = {In this paper, we encode topic dependencies in hierarchical multi-label Text Categorization (TC) by means of rerankers. We represent reranking hypotheses with several innovative kernels considering both the structure of the hierarchy and the probability of nodes. Additionally, to better investigate the role of category relationships, we consider two interesting cases: (i) traditional schemes in which node-fathers include all the documents of their child-categories; and (ii) more general schemes, in which children can include documents not belonging to their fathers. 
The extensive experimentation on Reuters Corpus Volume 1 shows that our rerankers inject effective structural semantic dependencies in multi-classifiers and significantly outperform the state of the art.}, booktitle = {Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (ACL 2012); Jeju, Korea; July 8-14}, author = {Moschitti, Alessandro and Ju, Qi and Johansson, Richard}, year = {2012}, pages = {759--767}, } @inProceedings{Borin-Lars2012-157213, title = {Transferring Frames: Utilization of Linked Lexical Resources}, abstract = {In our experiment, we evaluate the transferability of frames from Swedish to Finnish in parallel corpora. We evaluate both the theoretical possibility of transferring frames and the possibility of performing it using available lexical resources. We add the frame information to an extract of the Swedish side of the Kotus and JRC-Acquis corpora using an automatic frame labeler and copy it to the Finnish side. We focus on evaluating the results to get an estimation on how often the parallel sentences can be said to express the same frame. This sheds light to the questions: Are the same situations in the two languages expressed using different frames, i.e. are the frames transferable even in theory? How well can the frame information of running text be transferred from language to another? }, booktitle = {Proceedings of the Workshop on Inducing Linguistic Structure Submission (WILS)}, author = {Borin, Lars and Forsberg, Markus and Johansson, Richard and Muhonen, Kristiina and Purtonen, Tanja and Voionmaa, Kaarlo}, year = {2012}, pages = {8--15}, } @inProceedings{Borin-Lars2012-157338, title = {Search Result Diversification Methods to Assist Lexicographers}, abstract = {We show how the lexicographic task of finding informative and diverse example sentences can be cast as a search result diversification problem, where an objective based on relevance and diversity is maximized. 
This problem has been studied intensively in the information retrieval community during recent years, and efficient algorithms have been devised. We finally show how the approach has been implemented in a lexicographic project, and describe the relevance and diversity functions used in that context. }, booktitle = {Proceedings of the 6th Linguistic Annotation Workshop}, author = {Borin, Lars and Forsberg, Markus and Friberg Heppin, Karin and Johansson, Richard and Kjellandsson, Annika}, year = {2012}, pages = {113--117}, } @inProceedings{Ghosh-Sucheta2012-157440, title = {Global Features for Shallow Discourse Parsing}, abstract = {A coherently related group of sentences may be referred to as a discourse. In this paper we address the problem of parsing coherence relations as defined in the Penn Discourse Tree Bank (PDTB). A good model for discourse structure analysis needs to account both for local dependencies at the token-level and for global dependencies and statistics. We present techniques on using inter-sentential or sentence-level (global), data-driven, non-grammatical features in the task of parsing discourse. The parser model follows up previous approach based on using token-level (local) features with conditional random fields for shallow discourse parsing, which is lacking in structural knowledge of discourse. The parser adopts a two-stage approach where first the local constraints are applied and then global constraints are used on a reduced weighted search space (n-best). In the latter stage we experiment with different rerankers trained on the first stage n-best parses, which are generated using lexico-syntactic local features. 
The two-stage parser yields significant improvements over the best performing model of discourse parser on the PDTB corpus.}, booktitle = {Proceedings of the 13th Annual Meeting of the Special Interest Group on Discourse and Dialogue (SIGDIAL)}, author = {Ghosh, Sucheta and Riccardi, Giuseppe and Johansson, Richard}, year = {2012}, pages = {150--159}, } @inProceedings{Bennaceur-Amel2012-160393, title = {Machine Learning for Emergent Middleware}, abstract = {Highly dynamic and heterogeneous distributed systems are challenging today's middleware technologies. Existing middleware paradigms are unable to deliver on their most central promise, which is offering interoperability. In this paper, we argue for the need to dynamically synthesise distributed system infrastructures according to the current operating environment, thereby generating ``Emergent Middleware'' to mediate interactions among heterogeneous networked systems that interact in an ad hoc way. The paper outlines the overall architecture of Enablers underlying Emergent Middleware, and in particular focuses on the key role of learning in supporting such a process, spanning statistical learning to infer the semantics of networked system functions and automata learning to extract the related behaviours of networked systems.}, booktitle = {Proceedings of the Joint Workshop on Intelligent Methods for Software System Engineering (JIMSE)}, author = {Bennaceur, Amel and Howar, Falk and Issarny, Valérie and Johansson, Richard and Moschitti, Alessandro and Spalazzese, Romina and Steffen, Bernhard and Sykes, Daniel}, year = {2012}, note = {Accepted}, } @inProceedings{Johansson-Richard2012-163602, title = {Bridging the Gap between Two Different Swedish Treebanks}, abstract = {We present two simple adaptation methods to train a dependency parser in the situation when there are multiple treebanks available, and these treebanks are annotated according to different linguistic conventions. 
To test the methods, we train parsers on the Talbanken and Syntag treebanks of Swedish. The results show that the methods are effective for low-to-medium training set sizes.}, booktitle = {Proceedings of the Fourth Swedish Language Technology Conference (SLTC)}, author = {Johansson, Richard}, year = {2012}, note = {Accepted}, } @inProceedings{Volodina-Elena2012-165961, title = {Semi-automatic selection of best corpus examples for Swedish: Initial algorithm evaluation}, abstract = {The study presented here describes the results of the initial evaluation of two sorting approaches to automatic ranking of corpus examples for Swedish. Representatives from two potential target user groups have been asked to rate top three hits per approach for sixty search items from the point of view of the needs of their professional target groups, namely second/foreign language (L2) teachers and lexicographers. This evaluation has shown, on the one hand, which of the two approaches to example rating (called in the text below algorithms #1 and #2) performs better in terms of finding better examples for each target user group; and on the other hand, which features evaluators associate with good examples. It has also facilitated statistic analysis of the “good” versus “bad” examples with reference to the measurable features, such as sentence length, word length, lexical frequency profiles, PoS constitution, dependency structure, etc. with a potential to find out new reliable classifiers.}, booktitle = {Proceedings of the SLTC 2012 workshop on NLP for CALL, Lund, 25th October, 2012}, author = {Volodina, Elena and Johansson, Richard and Johansson Kokkinakis, Sofie}, year = {2012}, number = {080}, pages = {59--70}, } @inProceedings{Ghosh-Sucheta2011-151350, title = {End-to-End Discourse Parser Evaluation}, abstract = {We are interested in the problem of discourse parsing of textual documents. 
We present a novel end-to-end discourse parser that, given a plain text document in input, identifies the discourse relations in the text, assigns them a semantic label and detects discourse arguments spans. The parsing architecture is based on a cascade of decisions supported by Conditional Random Fields (CRF). We train and evaluate three different parsers using the PDTB corpus. The three system versions are compared to evaluate their robustness with respect to deep/shallow and automatically extracted syntactic features.}, booktitle = {Fifth IEEE International Conference on Semantic Computing (ICSC), 2011; September 18-21, 2011; Palo Alto, United States}, author = {Ghosh, Sucheta and Tonelli, Sara and Riccardi, Giuseppe and Johansson, Richard}, year = {2011}, ISBN = {978-1-4577-1648-5}, pages = {169--172}, } @inProceedings{Ghosh-Sucheta2011-151356, title = {Shallow Discourse Parsing with Conditional Random Fields}, abstract = {Parsing discourse is a challenging natural language processing task. In this paper we take a data driven approach to identify arguments of explicit discourse connectives. In contrast to previous work we do not make any assumptions on the span of arguments and consider parsing as a token-level sequence labeling task. We design the argument segmentation task as a cascade of decisions based on conditional random fields (CRFs). We train the CRFs on lexical, syntactic and semantic features extracted from the Penn Discourse Treebank and evaluate feature combinations on the commonly used test split. We show that the best combination of features includes syntactic and semantic features. 
The comparative error analysis investigates the performance variability over connective types and argument positions.}, booktitle = {Proceedings of 5th International Joint Conference on Natural Language Processing; editors Haifeng Wang and David Yarowsky; Chiang Mai, Thailand; November 8-13, 2011}, author = {Ghosh, Sucheta and Johansson, Richard and Riccardi, Giuseppe and Tonelli, Sara}, year = {2011}, pages = {1071--1079}, } @inProceedings{Ju-Qi2011-151361, title = {Towards Using Reranking in Hierarchical Classification}, abstract = {We consider the use of reranking as a way to relax typical independence assumptions often made in hierarchical multilabel classification. Our reranker is based on (i) an algorithm that generates promising k-best classification hypotheses from the output of local binary classifiers that classify nodes of a target tree-shaped hierarchy; and (ii) a tree kernel-based reranker applied to the classification tree associated with the hypotheses above. We carried out a number of experiments with this model on the Reuters corpus: we firstly show the potential of our algorithm by computing the oracle classification accuracy. This demonstrates that there is a significant room for potential improvement of the hierarchical classifier. Then, we measured the accuracy achieved by the reranker, which shows a significant performance improvement over the baseline. }, booktitle = {Proceedings of the Joint ECML/PKDD-PASCAL Workshop on Large-Scale Hierarchical Classification; September 5, 2011; Athens, Greece}, author = {Ju, Qi and Johansson, Richard and Moschitti, Alessandro}, year = {2011}, }