@inproceedings{francois-etal-2016-svalex-248142,
  title = {SVALex: a CEFR-graded lexical resource for Swedish foreign and second language learners},
  abstract = {The paper introduces SVALex, a lexical resource primarily aimed at learners and teachers of Swedish as a foreign and second language that describes the distribution of 15,681 words and expressions across the Common European Framework of Reference (CEFR). The resource is based on a corpus of coursebook texts, and thus describes receptive vocabulary learners are exposed to during reading activities, as opposed to productive vocabulary they use when speaking or writing. The paper describes the methodology applied to create the list and to estimate the frequency distribution. It also discusses some characteristics of the resulting resource and compares it to other lexical resources for Swedish. An interesting feature of this resource is the possibility to separate the wheat from the chaff, identifying the core vocabulary at each level, i.e. vocabulary shared by several coursebook writers at each level, from peripheral vocabulary which is used by the minority of the coursebook writers.},
  booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016), May 23-28, 2016, Portorož, Slovenia},
  author = {François, Thomas and Volodina, Elena and Pilán, Ildikó and Tack, Anaïs},
  year = {2016},
  publisher = {European Language Resources Association},
  address = {Paris},
  isbn = {978-2-9517408-9-1},
}

@inproceedings{daudaravicius-etal-2016-report-248143,
  title = {A report on the Automated Evaluation of Scientific Writing Shared Task},
  abstract = {The Automated Evaluation of Scientific Writing, or AESW, is the task of identifying sentences in need of correction to ensure their appropriateness in a scientific prose.
The data set comes from a professional editing company, VTeX, with two aligned versions of the same text – before and after editing – and covers a variety of textual infelicities that proofreaders have edited. While previous shared tasks focused solely on grammatical errors (Dale and Kilgarriff, 2011; Dale et al., 2012; Ng et al., 2013; Ng et al., 2014), this time edits cover other types of linguistic misfits as well, including those that almost certainly could be interpreted as style issues and similar “matters of opinion”. The latter arise because of different language editing traditions, experience, and the absence of uniform agreement on what “good” scientific language should look like. Initiating this task, we expected the participating teams to help identify the characteristics of “good” scientific language, and help create a consensus of which language improvements are acceptable (or necessary). Six participating teams took on the challenge.},
  booktitle = {Workshop on Innovative Use of NLP for Building Educational Applications, June 16, 2016, San Diego, CA, USA},
  author = {Daudaravicius, Vidas and Banchs, Rafael E. and Volodina, Elena and Napoles, Courtney},
  year = {2016},
  isbn = {978-1-941643-83-9},
}

@inproceedings{borin-etal-2016-towards-253952,
  title = {Towards a Big Data View on South Asian Linguistic Diversity},
  abstract = {South Asia with its rich and diverse linguistic tapestry of hundreds of languages, including many from four major language families, and a long history of intensive language contact, provides rich empirical data for studies of linguistic genealogy, linguistic typology, and language contact. South Asia is often referred to as a linguistic area, a region where, due to close contact and widespread multilingualism, languages have influenced one another to the extent that both related and unrelated languages are more similar on many linguistic levels than we would expect.
However, with some rare exceptions, most studies are largely impressionistic, drawing examples from a few languages. In this paper we present our ongoing work aiming at turning the linguistic material available in Grierson’s Linguistic Survey of India (LSI) into a digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia. In addition to this, we aim to contribute to the methodological development of large-scale comparative linguistics drawing on digital language resources, by exploring NLP techniques for extracting linguistic information from free-text language descriptions of the kind found in the LSI.},
  booktitle = {WILDRE-3 – 3rd Workshop on Indian Language Data: Resources and Evaluation},
  author = {Borin, Lars and Virk, Shafqat and Saxena, Anju},
  year = {2016},
  publisher = {European Language Resources Association},
  address = {Paris},
}

@inproceedings{r?dveneide-etal-2016-swedish-250073,
  title = {The Swedish Culturomics Gigaword Corpus: A One Billion Word Swedish Reference Dataset for NLP},
  abstract = {In this paper we present a dataset of contemporary Swedish containing one billion words. The dataset consists of a wide range of sources, all annotated using a state-of-the-art corpus annotation pipeline, and is intended to be a static and clearly versioned dataset. This will facilitate reproducibility of experiments across institutions and make it easier to compare NLP algorithms on contemporary Swedish. The dataset contains sentences from 1950 to 2015 and has been carefully designed to feature a good mix of genres balanced over each included decade. The sources include literary, journalistic, academic and legal texts, as well as blogs and web forum entries.},
  booktitle = {Linköping Electronic Conference Proceedings. Digital Humanities 2016.
From Digitization to Knowledge 2016: Resources and Methods for Semantic Processing of Digital Works/Texts, July 11, 2016, Krakow, Poland},
  author = {Rødven-Eide, Stian and Tahmasebi, Nina and Borin, Lars},
  year = {2016},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  isbn = {978-91-7685-733-5},
}

@misc{volodina-etal-2016-preface-248087,
  title = {Preface. Proceedings of the joint workshop on NLP for Computer Assisted Language Learning and NLP for Language Acquisition at SLTC, Umeå, 16th November 2016},
  abstract = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) \& NLP for Language Acquisition (LA) – shorthand NLP4CALL\&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below. The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics.
The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition. The joint workshop series on NLP4CALL\&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.},
  author = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars},
  year = {2016},
  number = {130},
  pages = {i--viii},
}

@proceedings{volodina-etal-2016-proceedings-248081,
  title = {Proceedings of the joint workshop on NLP for Computer Assisted Language Learning and NLP for Language Acquisition at SLTC, Umeå, 16th November 2016},
  abstract = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) \& NLP for Language Acquisition (LA) – shorthand NLP4CALL\&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below. The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics. The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition. The joint workshop series on NLP4CALL\&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.},
  editor = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars},
  year = {2016},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  isbn = {978-91-7685-633-8},
}

@incollection{borin-2016-lexikografi-246607,
  title = {Lexikografi för maskiner och lexikografi för människor},
  booktitle = {Framtidens lexikografi: Rapport från ett symposium i Göteborg 5 oktober 2012},
  author = {Borin, Lars},
  year = {2016},
  publisher = {Meijerbergs institut vid Göteborgs universitet},
  address = {Göteborg},
  isbn = {978-91-87850-01-1},
  pages = {9--27},
}

@inproceedings{borin-kosinski-2016-towards-238147,
  title = {Towards interactive visualization of public discourse in time and space},
  abstract = {We report on a proof-of-concept study where we (1) apply NLP tools for extracting political-discourse topics from a large Swedish Twitter dataset; and (2) design an interactive spatiotemporal visualization application allowing humanities and social-science scholars to explore how the tweet topics vary over space and time.},
  booktitle = {Linköping Electronic Conference Proceedings},
  author = {Borin, Lars and Kosiński, Tomasz},
  year = {2016},
  volume = {126},
  isbn = {978-91-7685-733-5},
  pages = {1--7},
}

@inproceedings{pilan-etal-2016-predicting-247240,
  title = {Predicting proficiency levels in learner writings by transferring a linguistic complexity model from expert-written coursebooks},
  abstract = {The lack of a sufficient amount of data tailored for a task is a well-recognized problem for many statistical NLP methods.
In this paper, we explore whether data sparsity can be successfully tackled when classifying language proficiency levels in the domain of learner-written output texts. We aim at overcoming data sparsity by incorporating knowledge in the trained model from another domain consisting of input texts written by teaching professionals for learners. We compare different domain adaptation techniques and find that a weighted combination of the two types of data performs best, which can even rival systems based on considerably larger amounts of in-domain data. Moreover, we show that normalizing errors in learners’ texts can substantially improve classification when in-domain data with annotated proficiency levels is not available.},
  booktitle = {Proceedings of the 26th International Conference on Computational Linguistics (COLING), December 13-16, 2016, Osaka},
  author = {Pilán, Ildikó and Volodina, Elena and Zesch, Torsten},
  year = {2016},
  isbn = {978-4-87974-702-0},
}

@inproceedings{volodina-etal-2016-swell-248141,
  title = {SweLL on the rise: Swedish Learner Language corpus for European Reference Level studies},
  abstract = {We present a new resource for Swedish, SweLL, a corpus of Swedish Learner essays linked to learners’ performance according to the Common European Framework of Reference (CEFR). SweLL consists of three subcorpora – SpIn, SW1203 and Tisus, collected from three different educational establishments. The common metadata for all subcorpora includes age, gender, native languages, time of residence in Sweden, type of written task. Depending on the subcorpus, learner texts may contain additional information, such as text genres, topics, grades. Five of the six CEFR levels are represented in the corpus: A1, A2, B1, B2 and C1 comprising in total 339 essays. C2 level is not included since courses at C2 level are not offered.
The work flow consists of collection of essays and permits, essay digitization and registration, meta-data annotation, automatic linguistic annotation. Inter-rater agreement is presented on the basis of SW1203 subcorpus. The work on SweLL is still ongoing with more than 100 essays waiting in the pipeline. This article both describes the resource and the “how-to” behind the compilation of SweLL.},
  booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016), May 23-28, 2016, Portorož, Slovenia},
  author = {Volodina, Elena and Pilán, Ildikó and Enström, Ingegerd and Llozhi, Lorena and Lundkvist, Peter and Sundberg, Gunlög and Sandell, Monica},
  year = {2016},
  publisher = {European Language Resources Association},
  address = {Paris},
  isbn = {978-2-9517408-9-1},
}

@inproceedings{kokkinakis-2016-linguistic-243100,
  title = {Linguistic and extra-linguistic parameters for early detection of cognitive impairment},
  abstract = {AIM: to adapt, develop and test methods that in isolation have shown promising outcomes on tasks related to (early) detection of dementia, differentiating between various dementia types and controls and also increase our understanding of the cognitive processes that underlie written text and certain forms of spoken language production. Unlike previous models, based solely on a certain aspect of language abilities (i.e. on written or spoken language alone), the project is comprehensive and more likely to provide new insights in the area of dementia detection and improve practices applied so far. The project builds on the success stories of the past and focus on the interplay between various types of technologies that hold the potential to provide reliable estimates for the detection of cognitive decline.
The project emphasizes its interdisciplinary nature, by bringing together researchers from humanities (computational linguistics / language technology), computer science and medicine, and foresees the development of a comprehensive set of novel analytic approaches not explored jointly in the past. GOAL: discovering evidence about linguistic performance and identifying whether the addition of new ways for investigating, combining and evaluating measurement and other parameters for improvement of established models can advance our understanding of: i) the boundaries between normal aging and dementia; ii) its effects on linguistic performance extrapolated from various sources and iii) whether effects of cognitive decline can be seen across (daily) language production.},
  booktitle = {European Summer School on Eye Movements (ESSEM), 11-17 September 2016, Athens, Greece},
  author = {Kokkinakis, Dimitrios},
  year = {2016},
}

@inproceedings{tahmasebi-etal-2016-clarin-233899,
  title = {SWE-CLARIN – the Swedish CLARIN project – aims and activities},
  booktitle = {Digital Humanities in the Nordic countries, Oslo, March 15-17 2016},
  author = {Tahmasebi, Nina and Borin, Lars and Jordan, Caspar and Ekman, Stefan},
  year = {2016},
  pages = {122--123},
}

@inproceedings{bouma-adesam-2016-multiword-251825,
  title = {Multiword Annotation in the Eukalyptus Treebank of Written Swedish},
  booktitle = {PARSEME, 6th general meeting, 7-8 April 2016, Struga, FYR Macedonia},
  author = {Bouma, Gerlof and Adesam, Yvonne},
  year = {2016},
}

@inproceedings{bouma-adesam-2016-part-254389,
  title = {Part-of-speech and Morphology Tagging Old Swedish},
  booktitle = {Proceedings of the Sixth Swedish Language Technology Conference (SLTC) Umeå University, 17-18 November, 2016},
  author = {Bouma, Gerlof and Adesam, Yvonne},
  year = {2016},
}

@inproceedings{forsberg-hulden-2016-deriving-237061,
  title = {Deriving Morphological Analyzers from Example Inflections},
  abstract = {This paper presents a semi-automatic method
to derive morphological analyzers from a limited number of example inflections suitable for languages with alphabetic writing systems. The system we present learns the inflectional behavior of morphological paradigms from examples and converts the learned paradigms into a finite-state transducer that is able to map inflected forms of previously unseen words into lemmas and corresponding morphosyntactic descriptions. We evaluate the system when provided with inflection tables for several languages collected from the Wiktionary.},
  booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016), May 23-28, 2016, Portorož, Slovenia},
  author = {Forsberg, Markus and Hulden, Mans},
  year = {2016},
  publisher = {European Language Resources Association},
  address = {Paris},
  isbn = {978-2-9517408-9-1},
}

@inproceedings{adesam-bouma-2016-swedish-251827,
  title = {Old Swedish Part-of-Speech Tagging between Variation and External Knowledge},
  booktitle = {Proceedings of the 10th SIGHUM Workshop on Language Technology for Cultural Heritage, Social Sciences, and Humanities, Berlin, Germany, August 11, 2016},
  author = {Adesam, Yvonne and Bouma, Gerlof},
  year = {2016},
  publisher = {Association for Computational Linguistics},
  address = {Stroudsburg, PA},
  isbn = {978-1-945626-09-8},
}

@inproceedings{ahlberg-etal-2016-sprakbanken's-246063,
  title = {Språkbanken’s Open Lexical Infrastructure},
  abstract = {Karp is an open lexical infrastructure and a web based tool for searching, exploring and developing lexical resources. Språkbanken currently hosts a number of lexicons in Karp and on-going work aims at broadening the type of resources that can be developed in the system. This abstract gives a short overview of Karp's basic functionality, and describes some current projects and on-going work.},
  booktitle = {SLTC 2016. The Sixth Swedish Language Technology Conference.
Umeå University, 17-18 November, 2016},
  author = {Ahlberg, Malin and Borin, Lars and Forsberg, Markus and Olsson, Olof and Schumacher, Anne and Uppström, Jonatan},
  year = {2016},
}

@inproceedings{ahlberg-etal-2016-karp-246072,
  title = {Karp: Språkbanken’s Open Lexical Infrastructure},
  booktitle = {Globalex 2016, May 24, Portorož, Slovenia},
  author = {Ahlberg, Malin and Borin, Lars and Forsberg, Markus and Olsson, Olof and Schumacher, Anne and Uppström, Jonatan},
  year = {2016},
}

@inproceedings{borin-etal-2016-sparv-246053,
  title = {Sparv: Språkbanken’s corpus annotation pipeline infrastructure},
  abstract = {Sparv is Språkbanken's corpus annotation pipeline infrastructure. The easiest way to use the pipeline is from its web interface with a plain text document. The pipeline uses in-house and external tools on the text to segment it into sentences and paragraphs, tokenise, tag parts-of-speech, look up in dictionaries and analyse compounds. The pipeline can also be run using a web API with XML results, and it is run locally at Språkbanken to prepare the documents in Korp, our corpus search tool. While the most sophisticated support is for modern Swedish, the pipeline supports 15 languages.},
  booktitle = {SLTC 2016. The Sixth Swedish Language Technology Conference, Umeå University, 17-18 November, 2016},
  author = {Borin, Lars and Forsberg, Markus and Hammarstedt, Martin and Rosén, Dan and Schäfer, Roland and Schumacher, Anne},
  year = {2016},
}

@inproceedings{nietopina-johansson-2016-embedding-241139,
  title = {Embedding Senses for Efficient Graph-based Word Sense Disambiguation},
  abstract = {We propose a simple graph-based method for word sense disambiguation (WSD) where sense and context embeddings are constructed by applying the Skip-gram method to random walks over the sense graph. We used this method to build a WSD system for Swedish using the SALDO lexicon, and evaluated it on six different annotated test sets.
In all cases, our system was several orders of magnitude faster than a state-of-the-art PageRank-based system, while outperforming a random baseline soundly.},
  booktitle = {Proceedings of TextGraphs-10: the Workshop on Graph-based Methods for Natural Language Processing},
  author = {Nieto Piña, Luis and Johansson, Richard},
  year = {2016},
  publisher = {Association for Computational Linguistics},
}

@article{lindstromtiedemann-etal-2016-larka-248112,
  title = {Lärka: ett verktyg för träning av språkterminologi och grammatik},
  abstract = {Lärka is a corpus-based tool, which allows students to practise and learn grammar based on authentic material. In this study we present how this has been used at four universities. We also use our logs to try to assess the students metalinguistic awareness in relation to international studies, and discuss how these logs can be used in the future.},
  journal = {LexicoNordica},
  author = {Lindström Tiedemann, Therese and Volodina, Elena and Jansson, Håkan},
  year = {2016},
  volume = {23},
  pages = {161--181},
}

@inproceedings{adouane-etal-2016-romanized-246849,
  title = {Romanized Berber and Romanized Arabic Automatic Language Identification Using Machine Learning},
  abstract = {The identification of the language of text/speech input is the first step to be able to properly do any language-dependent natural language processing. The task is called Automatic Language Identification (ALI). Being a well-studied field since early 1960’s, various methods have been applied to many standard languages. The ALI standard methods require datasets for training and use character/word-based n-gram models. However, social media and new technologies have contributed to the rise of informal and minority languages on the Web. The state-of-the-art automatic language identifiers fail to properly identify many of them. Romanized Arabic (RA) and Romanized Berber (RB) are cases of these informal languages which are under-resourced.
The goal of this paper is twofold: detect RA and RB, at a document level, as separate languages and distinguish between them as they coexist in North Africa. We consider the task as a classification problem and use supervised machine learning to solve it. For both languages, character-based 5-grams combined with additional lexicons score the best, F-score of 99.75\% and 97.77\% for RB and RA respectively.},
  booktitle = {Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects, December 12, 2016, Osaka, Japan},
  author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard},
  year = {2016},
  publisher = {Association for Computational Linguistics},
  pages = {53--61},
}

@inproceedings{nietopina-johansson-2016-benchmarking-251412,
  title = {Benchmarking Word Sense Disambiguation Systems for Swedish},
  abstract = {We compare several word sense disambiguation systems for Swedish and evaluate them on seven different sense-annotated corpora. Our results show that unsupervised systems beat a random baseline, but generally do not outperform a first-sense baseline considerably. On a lexical-sample dataset that allows us to train a supervised system, the unsupervised disambiguators are strongly outperformed by the supervised one.},
  booktitle = {The Sixth Swedish Language Technology Conference},
  author = {Nieto Piña, Luis and Johansson, Richard},
  year = {2016},
}

@inproceedings{pilan-2016-detecting-243715,
  title = {Detecting Context Dependence in Exercise Item Candidates Selected from Corpora},
  abstract = {We explore the factors influencing the dependence of single sentences on their larger textual context in order to automatically identify candidate sentences for language learning exercises from corpora which are presentable in isolation. An in-depth investigation of this question has not been previously carried out.
Understanding this aspect can contribute to a more efficient selection of candidate sentences which, besides reducing the time required for item writing, can also ensure a higher degree of variability and authenticity. We present a set of relevant aspects collected based on the qualitative analysis of a smaller set of context-dependent corpus example sentences. Furthermore, we implemented a rule-based algorithm using these criteria which achieved an average precision of 0.76 for the identification of different issues related to context dependence. The method has also been evaluated empirically where 80\% of the sentences in which our system did not detect context-dependent elements were also considered context-independent by human raters.},
  booktitle = {Proceedings of the 11th Workshop on Innovative Use of NLP for Building Educational Applications, June 12 to June 17, 2016, San Diego, USA},
  author = {Pilán, Ildikó},
  year = {2016},
}

@inproceedings{adouane-etal-2016-romanized-255457,
  title = {Romanized Arabic and Berber Detection Using Prediction by Partial Matching and Dictionary Methods},
  abstract = {Arabic is one of the Semitic languages written in Arabic script in its standard form. However, the recent rise of social media and new technologies has contributed considerably to the emergence of a new form of Arabic, namely Arabic written in Latin scripts, often called Romanized Arabic or Arabizi. While Romanized Arabic is an informal language, Berber or Tamazight uses Latin script in its standard form with some orthography differences depending on the country it is used in. Both these languages are under-resourced and unknown to the state-of-the-art language identifiers. In this paper, we present a language automatic identifier for both Romanized Arabic and Romanized Berber.
We also describe the built linguistic resources (large dataset and lexicons) including a wide range of Arabic dialects (Algerian, Egyptian, Gulf, Iraqi, Levantine, Moroccan and Tunisian dialects) as well as the most popular Berber varieties (Kabyle, Tashelhit, Tarifit, Tachawit and Tamzabit). We use the Prediction by Partial Matching (PPM) and dictionary-based methods. The methods reach a macro-average F-Measure of 98.74\% and 97.60\% respectively.},
  booktitle = {2016 IEEE/ACS 13th International Conference of Computer Systems and Applications (AICCSA)},
  author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard},
  year = {2016},
  isbn = {978-1-5090-4320-0},
}

@inproceedings{kokkinakis-etal-2016-specifications-243183,
  title = {Specifications and Methodology for Language-Related Data Acquisition and Analysis in the Domain of Dementia Diagnostics},
  abstract = {This paper outlines the initial stages of a project that aims to build and use a corpus with data samples acquired from people diagnosed with subjective or mild cognitive impairment and healthy, age-matched controls. The data we are currently collecting consists of audio-recorded spoken language samples; transcripts of the audio recordings and eye tracking measurements. From these data we plan to extract, evaluate and model features to be used for learning classification models in order to test how well a differentiation between the aforementioned subject groups can be made. Features will be also correlated with outcomes from e.g.
other language-related scores, such as word fluency, in order to investigate whether there are relationships between various variables.},
  booktitle = {The Sixth Swedish Language Technology Conference (SLTC) Umeå University, 17-18 November, 2016},
  author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Björkner, Eva and Nordlund, Arto},
  year = {2016},
}

@inproceedings{lundholmfors-breitholtz-2016-mocking-240344,
  title = {Are you mocking me or are you laughing with me?},
  booktitle = {SEMDIAL 2016, JerSem, Proceedings of the 20th Workshop on the Semantics and Pragmatics of Dialogue, 16-18 July 2016, Rutgers, New Brunswick, NJ, USA},
  editor = {Hunter, Julie and Simons, Mandy and Stone, Matthew},
  author = {Lundholm Fors, Kristina and Breitholtz, Ellen},
  year = {2016},
}

@inproceedings{johansson-etal-2016-multi-233140,
  title = {A Multi-domain Corpus of Swedish Word Sense Annotation},
  abstract = {We describe the word sense annotation layer in Eukalyptus, a freely available five-domain corpus of contemporary Swedish with several annotation layers. The annotation uses the SALDO lexicon to define the sense inventory, and allows word sense annotation of compound segments and multiword units. We give an overview of the new annotation tool developed for this project, and finally present an analysis of the inter-annotator agreement between two annotators.},
  booktitle = {10th edition of the Language Resources and Evaluation Conference, 23-28 May 2016, Portorož (Slovenia)},
  author = {Johansson, Richard and Adesam, Yvonne and Bouma, Gerlof and Hedberg, Karin},
  year = {2016},
  publisher = {European Language Resources Association},
  isbn = {978-2-9517408-9-1},
}

@inproceedings{kelly-etal-2016-identifying-242814,
  title = {Identifying Perceptually Similar Voices with a Speaker Recognition System Using Auto-Phonetic Features},
  booktitle = {17th Annual Conference of the International Speech Communication Association (Interspeech 2016). San Francisco, CA, USA.
8-12 September 2016},
  author = {Kelly, Finnian and Alexander, Anil and Forth, Oscar and Kent, Samuel and Lindh, Jonas and Åkesson, Joel},
  year = {2016},
  pages = {1567--1568},
}

@inproceedings{lindh-akesson-2016-evaluation-242811,
  title = {Evaluation of Software ‘Error checks’ on the SweEval2016 Corpus for Forensic Speaker Comparison},
  booktitle = {Proceedings of IAFPA25. 25th Annual Conference of the International Association for Forensic Phonetics and Acoustics. York, UK 24th – 27th July 2016},
  author = {Lindh, Jonas and Åkesson, Joel},
  year = {2016},
  pages = {57--58},
}

@inproceedings{kelly-etal-2016-automatically-242810,
  title = {Automatically identifying perceptually similar voices for voice parades},
  booktitle = {Proceedings of IAFPA25. 25th Annual Conference of the International Association for Forensic Phonetics and Acoustics. York, UK 24th – 27th July 2016},
  author = {Kelly, Finnian and Alexander, Anil and Forth, Oscar and Kent, Samuel and Lindh, Jonas and Åkesson, Joel},
  year = {2016},
  pages = {25--26},
}

@inproceedings{alfter-bizzoni-2016-hybrid-246348,
  title = {Hybrid Language Segmentation for Historical Documents},
  booktitle = {Proceedings CLiC-it 2016 and EVALITA 2016, Napoli, Italy, December 5-7, 2016},
  editor = {Basile, Pierpaolo and Corazza, Anna and Cutugno, Franco and Montemagni, Simonetta and Nissim, Malvina and Patti, Viviana and Semeraro, Giovanni and Sprugnoli, Rachele},
  author = {Alfter, David and Bizzoni, Yuri},
  year = {2016},
}

@inproceedings{kokkinakis-etal-2016-data-243069,
  title = {Data Resource Acquisition from People at Various Stages of Cognitive Decline – Design and Exploration Considerations},
  abstract = {In this paper we are introducing work in progress towards the development of an infrastructure (i.e., design, methodology, creation and description) of linguistic and extra-linguistic data samples acquired from people diagnosed with subjective or mild cognitive impairment and healthy, age-matched controls.
The data we are currently collecting consists of various types of modalities; i.e. audio-recorded spoken language samples; transcripts of the audio recordings (text) and eye tracking measurements. The integration of the extra-linguistic information with the linguistic phenotypes and measurements elicited from audio and text, will be used to extract, evaluate and model features to be used in machine learning experiments. In these experiments, classification models that will be trained, that will be able to learn from the whole or a subset of the data to make predictions on new data in order to test how well a differentiation between the aforementioned groups can be made. Features will be also correlated with measured outcomes from e.g. language-related scores, such as word fluency, in order to investigate whether there are relationships between various variables.}, booktitle = {The Seventh International Workshop on Health Text Mining and Information Analysis (Louhi). November 5, 2016, Austin, Texas, USA}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Nordlund, Arto}, year = {2016}, } @misc{kokkinakis-2016-proceedings-252412, title = {Proceedings of LREC 2016 Workshop: Resources and Processing of Linguistic and Extra-Linguistic Data from People with Various Forms of Cognitive/Psychiatric Impairments (RaPID-2016), Monday 23rd of May 2016. 
Linköping electronic conference proceedings.}, abstract = {The purpose of the Workshop on “Resources and ProcessIng of linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments” (RaPID-2016) was to provide a snapshot view of some of the current technological landscape, resources, data samples and also needs and challenges in the area of processing various data from individuals with various types of mental and neurological health impairments and similar conditions at various stages; increase the knowledge, understanding, awareness and ability to achieve useful outcomes in this area and strengthen the collaboration between researchers and workers in the field of clinical/nursing/medical sciences and those in the field of language technology/computational linguistics/Natural Language Processing (NLP). Although many of the causes of cognitive and neuropsychiatric impairments are difficult to foresee and accurately predict, physicians and clinicians work with a wide range of factors that potentially contribute to such impairments, e.g., traumatic brain injuries, genetic predispositions, side effects of medication, and congenital anomalies. In this context, there is new evidence that the acquisition and processing of linguistic data (e.g., spontaneous story telling) and extra-linguistic and production measures (e.g., eye tracking) could be used as a complement to clinical diagnosis and provide the foundation for future development of objective criteria to be used for identifying progressive decline or degeneration of normal mental and brain functioning. An important new area of research in NLP emphasizes the processing, analysis, and interpretation of such data and current research in this field, based on linguistic-oriented analysis of text and speech produced by such a population and compared to healthy adults, has shown promising outcomes. 
This is manifested in early diagnosis and prediction of individuals at risk, the differentiation of individuals with various degrees of severity forms of brain and mental illness, and for the monitoring of the progression of such conditions through the diachronic analysis of language samples or other extralinguistic measurements. Initially, work was based on written data but there is a rapidly growing body of research based on spoken samples and other modalities. Nevertheless, there remains significant work to be done to arrive at more accurate estimates for prediction purposes in the future and more research is required in order to reliably complement the battery of medical and clinical examinations currently undertaken for the early diagnosis or monitoring of, e.g., neurodegenerative and other brain and mental disorders and accordingly, aid the development of new, non-invasive, time and cost-effective and objective (future) clinical tests in neurology, psychology, and psychiatry.}, author = {Kokkinakis, Dimitrios}, year = {2016}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7685-730-4}, } @inProceedings{adouane-etal-2016-asirem-246853, title = {ASIREM Participation at the Discriminating Similar Languages Shared Task 2016}, booktitle = {Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects; 163–169; December 12; Osaka, Japan}, author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard}, year = {2016}, } @inProceedings{lindh-etal-2016-comparison-242808, title = {Comparison of Perceptual and ASR Results on the SweEval2016 Corpus}, booktitle = {Proceedings of IAFPA25. 25th Annual Conference of the International Association for Forensic Phonetics and Acoustics. 
York, UK 24th – 27th July 2016.}, author = {Lindh, Jonas and Åkesson, Joel and Sundqvist, Maria}, year = {2016}, pages = {110--111}, } @book{rosen-2016-theory-231969, title = {Theory Exploration and Inductive Theorem Proving}, abstract = {We have built two state-of-the-art inductive theorem provers named HipSpec and Hipster. The main issue when automating proofs by induction is to discover essential helper lemmas. Our theorem provers use the technique theory exploration, which is a method to systematically discover interesting conclusions about a mathematical theory. We use the existing theory exploration system QuickSpec which conjectures properties for a program that seem to hold based on testing. The idea is to try to prove these explored conjectures together with the user-stated goal conjecture. By using this idea and connecting it with our previous work on Hip, the Haskell Inductive Prover, we were able to take new leaps in field of inductive theorem proving. Additionally, we have developed a benchmark suite named TIP, short for Tons of Inductive Problems, with benchmark problems for inductive theorem provers, and a tool box for converting and manipulating problems expressed in the TIP format. There were two main reasons to this initiative. Firstly, the inductive theorem proving field lacked a shared benchmark suite as well as a format. Secondly, the benchmarks that have been used were outdated: all contemporary provers would solve almost every problem. We have so far added hundreds of new challenges to the TIP suite to encourage further research. }, author = {Rosén, Dan}, year = {2016}, publisher = {Chalmers University of Technology}, address = {Göteborg}, } @article{sundqvist-etal-2016-syllable-227628, title = {Syllable Repetition vs. 
Finger Tapping: Aspects of Motor Timing in 100 Healthy Adults.},
  abstract = {In this study we systematically compared syllable repetition and finger tapping in healthy adults, and explored possible impacts of tempi, metronome, musical experience, and age on motor timing ability. One hundred healthy adults used finger-tapping and syllable repetition to perform an isochronous pulse in three different tempi, with and without a metronome. Results showed that the motor timing was more accurate with finger tapping than with syllable repetition in the slowest tempo, and the motor timing ability was better with the metronome than without. Persons with musical experience showed better motor timing accuracy than persons without such experience, and the timing asynchrony increased with increasing age. The slowest tempo 90 bpm posed extra challenges to the participants. We speculate that this pattern reflects the fact that the slow tempo lies outside the 3-8 Hz syllable rate of natural speech, which in turn has been linked to theta-based oscillations in the brain.},
  journal = {Motor control},
  author = {Sundqvist, Maria and Åsberg Johnels, Jakob and Lindh, Jonas and Laakso, Katja and Hartelius, Lena},
  year = {2016},
  volume = {20},
  number = {3},
  pages = {233--254},
}

@inProceedings{volodina-etal-2016-friend-248093,
  title = {A Friend in Need? Research agenda for electronic Second Language infrastructure.},
  abstract = {In this article, we describe the research and societal needs as well as ongoing efforts to shape Swedish as a Second Language (L2) infrastructure. 
Our aim is to develop an electronic research infrastructure that would stimulate empiric research into learners’ language development by preparing data and developing language technology methods and algorithms that can successfully deal with deviations in the learner language.}, booktitle = {Proceedings of the Swedish Language Technology Conference}, author = {Volodina, Elena and Megyesi, Beata and Wirén, Mats and Granstedt, Lena and Prentice, Julia and Reichenberg, Monica and Sundberg, Gunlög }, year = {2016}, publisher = {Umeå Universitet}, } @inProceedings{alfter-etal-2016-from-246345, title = {From Distributions to Labels: A Lexical Proficiency Analysis using Learner Corpora}, abstract = {In this work we look at how information from second language learner essay corpora can be used for the evaluation of unseen learner essays. Using a corpus of learner essays which have been graded by well-trained human assessors using the CEFR scale, we extract a list of word distributions over CEFR levels. For the analysis of unseen essays, we want to map each word to a so-called target CEFR level using this word list. However, the task of mapping from a distribution to a single label is not trivial. We are also investigating how we can evaluate the mapping from distribution to label. We show that the distributional profile of words from the essays, informed with the essays’ levels, consistently overlaps with our frequency-based method, in the sense that words holding the same level of proficiency as predicted by our mapping tend to cluster together in a semantic space. In the absence of a gold standard, this information can be useful to see how often a word is associated with the same level in two different models. Also, in this case we have a similarity measure that can show which words are more central to a given level and which words are more peripheral. 
}, booktitle = {Linköping Electronic Conference Proceedings}, author = {Alfter, David and Bizzoni, Yuri and Agebjörn, Anders and Volodina, Elena and Pilán, Ildikó}, year = {2016}, publisher = {Linköping University Electronic Press}, ISBN = {978-91-7685-633-8}, } @article{themistocleous-logotheti-2016-standard-239899, title = {Standard Modern Greek and Cypriot Greek vowels: a sociophonetic study}, abstract = {This study is a comparative analysis of Standard Modern Greek (SMG) and Cypriot Greek (CG) vowels. Specifically, the study examines the effects of vowel (/e i a o u/), language variety (SMG vs CG), and stress (stressed vs unstressed vowels) on vowel formants F1 and F2, vowel duration, and fundamental frequency (f0). 45 female speakers were recorded: 20 SMG speakers and 25 CG speakers from Athens and Nicosia respectively. The results showed significant effects of vowel, stress, and language variety on formants, duration and f0. The study confirms the findings of earlier studies on SMG vowels, provides the first report on CG vowels’ acoustic structure, and constitutes the first comparative sociophonetic research on SMG and CG vowels. }, journal = {Proceedings of the international conference on Modern Greek dialects and Linguistic Theory, Patras, 25-28 September 2014}, author = {Themistocleous, Charalambos and Logotheti, Angeliki}, year = {2016}, volume = {6}, number = {1}, pages = {178--184}, } @inProceedings{adouane-etal-2016-arabicized-252492, title = {Arabicized and Romanized Berber Automatic Identification}, abstract = {We present an automatic language identification tool for both Arabicized Berber (Berber written in the Arabic script) and Romanized Berber (Berber written in the Latin script). The focus is on short texts (social media content). We use supervised machine learning method with character and word-based n-gram models as features. We also describe the corpora used in this paper. 
For both Arabicized and Romanized Berber, character-based 5-grams score the best giving an F-score of 99.50%.}, booktitle = {Proceedings of TICAM 2016}, author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard}, year = {2016}, publisher = {IRCAM}, address = {Morocco}, } @inProceedings{pilan-volodina-2016-classification-248099, title = {Classification of Language Proficiency Levels in Swedish Learners' Texts}, abstract = {We evaluate a system for the automatic classification of texts written by learners of Swedish as a second language into levels of language proficiency. Since the amount of available annotated learner essay data for our target language is rather small, we explore also the potentials of domain adaptation for this task. The additional domain consists of coursebook texts written by experts for learners. We find that already with a smaller amount of in-domain Swedish learner essay data it is possible to obtain results that compare well to state-of-the-art systems for other languages, with domain adaptation methods yielding a slight improvement.}, booktitle = {The Sixth Swedish Language Technology Conference (SLTC), Umeå University, 17-18 November, 2016}, author = {Pilán, Ildikó and Volodina, Elena}, year = {2016}, } @inProceedings{lindstromtiedemann-volodina-2016-larka-248119, title = {Lärka som didaktiskt verktyg. 
Undersökning om studenternas metaspråkliga kunskap.},
  booktitle = {Svenskans Beskrivning 35, 11-13 maj 2016, Göteborg},
  author = {Lindström Tiedemann, Therese and Volodina, Elena},
  year = {2016},
}

@inProceedings{volodina-pilan-2016-svalex-248116,
  title = {SVALex: en andraspråksordlista graderad enligt CEFR nivåer.},
  booktitle = {Svenskans Beskrivning 35, Göteborg 2016},
  author = {Volodina, Elena and Pilán, Ildikó},
  year = {2016},
}

@inProceedings{volodina-etal-2016-swell-248145,
  title = {SweLL – en korpus med L2 uppsatser för CEFR studier.},
  booktitle = {Svenskans Beskrivning 35, Göteborg 2016},
  author = {Volodina, Elena and Pilán, Ildikó and Enström, Ingegerd and Lundkvist, Peter and Sundberg, Gunlög and Llozhi, Lorena and Sandell, Monica},
  year = {2016},
}

@inProceedings{volodina-etal-2016-swellex-248090,
  title = {SweLLex: second language learners' productive vocabulary.},
  abstract = {This paper presents a new lexical resource for learners of Swedish as a second language, SweLLex, and a know-how behind its creation. We concentrate on L2 learners’ productive vocabulary, i.e. words that they are actively able to produce, rather than the lexica they comprehend (receptive vocabulary). The proposed list covers productive vocabulary used by L2 learners in their essays. Each lexical item on the list is connected to its frequency distribution over the six levels of proficiency defined by the Common European Framework of Reference (CEFR) (Council of Europe, 2001). To make this list a more reliable resource, we experiment with normalizing L2 word-level errors by replacing them with their correct equivalents. SweLLex has been tested in a prototype system for automatic CEFR level classification of essays as well as in a visualization tool aimed at exploring L2 vocabulary contrasting receptive and productive vocabulary usage at different levels of language proficiency.},
  booktitle = {Linköping Electronic Conference Proceedings. 
Proceedings of the joint workshop on NLP for Computer Assisted Language Learning and NLP for Language Acquisition at SLTC, Umeå, 16th November 2016}, author = {Volodina, Elena and Pilán, Ildikó and Llozhi, Lorena and Degryse, Baptiste and François, Thomas }, year = {2016}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7685-633-8}, } @inProceedings{alfter-volodina-2016-modeling-246347, title = {Modeling Individual Learner Knowledge in a Computer Assisted Language Learning System}, booktitle = {Proceedings of the Sixth Swedish Language Technology Conference. Umeå University, 17-18 November, 2016}, author = {Alfter, David and Volodina, Elena}, year = {2016}, } @inProceedings{volodina-etal-2016-classification-246346, title = {Classification of Swedish learner essays by CEFR levels}, abstract = {The paper describes initial efforts on creating a system for the automatic assessment of Swedish second language (L2) learner essays from two points of view: holistic evaluation of the reached level according to the Common European Framework of Reference (CEFR), and the lexical analysis of texts for receptive and productive vocabulary per CEFR level. We describe the data and resources that our experiments were based on, provide a short introduction to the algorithm for essay classification and experiment results, present the user interface we developed for testing new essays and outline future work. }, booktitle = {Proceedings of EuroCALL 2016. 24-27th August 2016, Cyprus.}, author = {Volodina, Elena and Pilán, Ildikó and Alfter, David}, year = {2016}, publisher = {Research-publishing.net}, ISBN = { 978-1-908416-44-5}, } @inProceedings{nusko-etal-2016-building-238135, title = {Building a Sentiment Lexicon for Swedish}, abstract = {In this paper we will present our ongoing project to build and evaluate a sentiment lexicon for Swedish. 
Our main resource is SALDO, a lexical resource of modern Swedish developed at Språkbanken, University of Gothenburg. Using a semi-supervised approach, we expand a manually chosen set of six core words using parent-child relations based on the semantic network structure of SALDO. At its current stage the lexicon consists of 175 seeds, 633 children, and 1319 grandchildren.},
  booktitle = {Linköping Electronic Conference Proceedings},
  author = {Nusko, Bianka and Tahmasebi, Nina and Mogren, Olof},
  year = {2016},
  volume = {126},
  number = {006},
  ISBN = {978-91-7685-733-5},
  pages = {32--37},
}

@inProceedings{pilan-etal-2016-coursebook-246349,
  title = {Coursebook texts as a helping hand for classifying linguistic complexity in language learners' writings},
  abstract = {We bring together knowledge from two different types of language learning data, texts learners read and texts they write, to improve linguistic complexity classification in the latter. Linguistic complexity in the foreign and second language learning context can be expressed in terms of proficiency levels. We show that incorporating features capturing lexical complexity information from reading passages can boost significantly the machine learning based classification of learner-written texts into proficiency levels. With an F1 score of .8 our system rivals state-of-the-art results reported for other languages for this task. Finally, we present a freely available web-based tool for proficiency level classification and lexical complexity visualization for both learner writings and reading texts. },
  booktitle = {Proceedings of the workshop on Computational Linguistics for Linguistic Complexity},
  author = {Pilán, Ildikó and Alfter, David and Volodina, Elena},
  year = {2016},
  ISBN = {978-4-87974-709-9},
}

@inProceedings{alfter-2016-learning-241664,
  title = {Learning the Learner: User Modeling in Intelligent Computer Assisted Language Learning Systems},
  booktitle = {CEUR Workshop Proceedings, v.1618. 
UMAP 2016 Extended Proceedings. Halifax, Canada, July 13-16, 2016. Edited by : Federica Cena, Michel Desmarais, Darina Dicheva, Jie Zhang}, author = {Alfter, David}, year = {2016}, } @inProceedings{adouane-johansson-2016-gulf-242243, title = {Gulf Arabic Resource Building for Sentiment Analysis}, abstract = {This paper deals with building linguistic resources for Gulf Arabic, one of the Arabic variations, for sentiment analysis task using machine learning. To our knowledge, no previous works were done for Gulf Arabic sentiment analysis despite the fact that it is present in different online platforms. Hence, the first challenge is the absence of annotated data and sentiment lexicons. To fill this gap, we created these two main linguistic resources. Then we conducted different experiments: use Naive Bayes classifier without any lexicon; add a sentiment lexicon designed basically for MSA; use only the compiled Gulf Arabic sentiment lexicon and finally use both MSA and Gulf Arabic sentiment lexicons. The Gulf Arabic lexicon gives a good improvement of the classifier accuracy (90.54 %) over a baseline that does not use the lexicon (82.81%), while the MSA lexicon causes the accuracy to drop to (76.83%). Moreover, mixing MSA and Gulf Arabic lexicons causes the accuracy to drop to (84.94%) compared to using only Gulf Arabic lexicon. 
This indicates that it is useless to use MSA resources to deal with Gulf Arabic due to the considerable differences and conflicting structures between these two languages.},
  booktitle = {Proceedings of the Language Resources and Evaluation Conference (LREC), 23-28 May 2016, Portorož, Slovenia},
  author = {Adouane, Wafia and Johansson, Richard},
  year = {2016},
  publisher = {European Language Resources Association},
  ISBN = {978-2-9517408-9-1},
}

@article{rehm-etal-2016-strategic-237609,
  title = {The strategic impact of META-NET on the regional, national and international level},
  abstract = {This article provides an overview of the dissemination work carried out in META-NET from 2010 until 2015; we describe its impact on the regional, national and international level, mainly with regard to politics and the funding situation for LT topics. The article documents the initiative’s work throughout Europe in order to boost progress and innovation in our field.},
  journal = {Language resources and evaluation},
  author = {Rehm, Georg and Uszkoreit, Hans and Ananiadou, Sophia and Bel, Núria and Bielevičienė, Audronė and Borin, Lars and Branco, António and Budin, Gerhard and Calzolari, Nicoletta and Daelemans, Walter and Garabík, Radovan and Grobelnik, Marko and García-Mateo, Carmen and van Genabith, Josef and Hajič, Jan and Hernáez, Inma and Judge, John and Koeva, Svetla and Krek, Simon and Krstev, Cvetana and Lindén, Krister and Magnini, Bernardo and Mariani, Joseph and McNaught, John and Melero, Maite and Monachini, Monica and Moreno, Asunción and Odijk, Jan and Ogrodniczuk, Maciej and Pęzik, Piotr and Piperidis, Stelios and Przepiórkowski, Adam and Rögnvaldsson, Eiríkur and Rosner, Mike and Pedersen, Bolette Sandford and Skadiņa, Inguna and De Smedt, Koenraad and Tadić, Marko and Thompson, Paul and Tufiş, Dan and Váradi, Tamás and Vasiļjevs, Andrejs and Vider, Kadri and Zabarskaitė, Jolanta},
  year = {2016},
  volume = {50},
  number = {2},
  pages = {351--374},
}
@article{pilan-etal-2016-readable-226565,
  title = {A readable read: Automatic Assessment of Language Learning Materials based on Linguistic Complexity.},
  abstract = {Corpora and web texts can become a rich language learning resource if we have a means of assessing whether they are linguistically appropriate for learners at a given proficiency level. In this paper, we aim at addressing this issue by presenting the first approach for predicting linguistic complexity for Swedish second language learning material on a 5-point scale. After showing that the traditional Swedish readability measure, Läsbarhetsindex (LIX), is not suitable for this task, we propose a supervised machine learning model, based on a range of linguistic features, that can reliably classify texts according to their difficulty level. Our model obtained an accuracy of 81.3% and an F-score of 0.8, which is comparable to the state of the art in English and is considerably higher than previously reported results for other languages. We further studied the utility of our features with single sentences instead of full texts since sentences are a common linguistic unit in language learning exercises. We trained a separate model on sentence-level data with five classes, which yielded 63.4% accuracy. Although this is lower than the document level performance, we achieved an adjacent accuracy of 92%. Furthermore, we found that using a combination of different features, compared to using lexical features alone, resulted in 7% improvement in classification accuracy at the sentence level, whereas at the document level, lexical features were more dominant. 
Our models are intended for use in a freely accessible web-based language learning platform for the automatic generation of exercises, and they will be available also in the form of web-services.}, journal = {Computational Linguistics and Applications}, author = {Pilán, Ildikó and Vajjala, Sowmya and Volodina, Elena}, year = {2016}, volume = {7}, number = {1}, pages = {143--159}, } @article{themistocleous-2016-seeking-239901, title = {Seeking an Anchorage. Stability and Variability in Tonal Alignment of Rising Prenuclear Pitch Accents in Cypriot Greek}, abstract = {Although tonal alignment constitutes a quintessential property of pitch accents, its exact characteristics remain unclear. This study, by exploring the timing of the Cypriot Greek L*+H prenuclear pitch accent, examines the predictions of three hypotheses about tonal alignment: the invariance hypothesis, the segmental anchoring hypothesis, and the segmental anchorage hypothesis. The study reports on two experiments: the first of which manipulates the syllable patterns of the stressed syllable, and the second of which modifies the distance of the L*+H from the following pitch accent. The findings on the alignment of the low tone (L) are illustrative of the segmental anchoring hypothesis predictions: the L persistently aligns inside the onset consonant, a few milliseconds before the stressed vowel. However, the findings on the alignment of the high tone (H) are both intriguing and unexpected: the alignment of the H depends on the number of unstressed syllables that follow the prenuclear pitch accent. The ‘wandering’ of the H over multiple syllables is extremely rare among languages, and casts doubt on the invariance hypothesis and the segmental anchoring hypothesis, as well as indicating the need for a modified version of the segmental anchorage hypothesis. 
To address the alignment of the H, we suggest that it aligns within a segmental anchorage–the area that follows the prenuclear pitch accent–in such a way as to protect the paradigmatic contrast between the L*+H prenuclear pitch accent and the L+H* nuclear pitch accent.}, journal = {Language and Speech}, author = {Themistocleous, Charalambos}, year = {2016}, volume = {59}, number = {4}, pages = {433--461}, } @inProceedings{adouane-etal-2016-automatic-246765, title = {Automatic Detection of Arabicized Berber and Arabic Varieties}, abstract = {Automatic Language Identification (ALI) is the detection of the natural language of an input text by a machine. It is the first necessary step to do any language-dependent natural language processing task. Various methods have been successfully applied to a wide range of languages, and the state-of-the-art automatic language identifiers are mainly based on character n-gram models trained on huge corpora. However, there are many languages which are not yet automatically processed, for instance minority and informal languages. Many of these languages are only spoken and do not exist in a written format. Social media platforms and new technologies have facilitated the emergence of written format for these spoken languages based on pronunciation. The latter are not well represented on the Web, commonly referred to as under-resourced languages, and the current available ALI tools fail to properly recognize them. In this paper, we revisit the problem of ALI with the focus on Arabicized Berber and dialectal Arabic short texts. We introduce new resources and evaluate the existing methods. 
The results show that machine learning models combined with lexicons are well suited for detecting Arabicized Berber and different Arabic varieties and distinguishing between them, giving a macro-average F-score of 92.94%.}, booktitle = {Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects; 63–72; December 12; Osaka, Japan}, author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard and Bobicev, Victoria}, year = {2016}, } @techreport{borin-etal-2016-free-233768, title = {A free cloud service for OCR / En fri molntjänst för OCR}, author = {Borin, Lars and Bouma, Gerlof and Dannélls, Dana}, year = {2016}, publisher = {University of Gothenburg}, address = {Göteborg}, } @techreport{wilhelmsson-2016-huvudansatser-247442, title = {Huvudansatser för parsningsmetoder. Om programutvecklingens förutsättningar i en svensk kontext}, abstract = {Syftet med denna text var att ge en inblick i området (syntaktisk) parsning. Tanken var att ge en bild av utvecklingen som var 1) fri från alltför tekniska detaljer, då området är programmeringstekniskt, och 2) beskriven ur ett svenskt perspektiv. Bakgrunden till valet av ämne till texten, som var tänkt att finnas med i antologin Text och kontext, var att parsning är relativt okänt för många personer verksamma inom närliggande områden, samtidigt som det är ett absolut nyckelbegrepp för den som ägnar sig åt datorlingvistik eller språkteknologi. Målet var alltså att ge en ganska allmän utifrånblick på några centrala sidor av utvecklingen, samtidigt som det tydligt är så att den som själv arbetat med utveckling kan ha starka åsikter och preferenser rörande metodval, något som i ärlighetens namn kanske inte heller denna text är lösgjord från. Hur ska det göras? Konsten att utveckla automatisk syntaxanalys av naturlig text kan läras ut från ett flertal perspektiv. Det kan t.ex. 
ske med fokus på användandet av en viss grammatikformalism, med fokus på beräkningssnabbhet, med fokus på entydiggörande av möjliga ambiguiteter. Tolkningsval kan göras med hjälp av antingen handskrivna regler eller inhämtad statistik. En sorts huvudtema i denna text är hur metoder för parsning på senare år uppvisar förändringar som kanske kan förklaras med att programmen har fått andra användningsområden och att metoderna har anpassats därefter (en annan tolkning är att flera senare system inte längre gör parsning i strikt mening). När detta tänkta ”kapitel” var färdigt fick det kommentaren att det inte var anpassat för antologins målgrupp. Det fick skrivas en annan kapiteltext, men det kom samtidigt ett förslag att publicera texten om parsning här som denna rapport.}, author = {Wilhelmsson, Kenneth}, year = {2016}, publisher = {Göteborgs universitet}, address = {Göteborg}, }

% NOTE(review): "number" in the next entry holds a series title rather than an
% issue number; confirm the intended field before relying on its rendering.
@article{adesam-etal-2016-sprakteknologi-237884,
  author   = {Adesam, Yvonne and Ahlberg, Malin and Andersson, Peter and Borin, Lars and Bouma, Gerlof and Forsberg, Markus},
  title    = {Språkteknologi för svenska språket genom tiderna},
  journal  = {Kungliga Skytteanska Samfundets Handlingar},
  year     = {2016},
  volume   = {76},
  number   = {Studier i svensk språkhistoria 13},
  pages    = {65--87},
  abstract = {Språkbanken, the Swedish Language Bank, is a language technology research unit at the Department of Swedish, University of Gothenburg. We develop language resources – such as corpora, lexical resources, and analytical tools – for all variants of Swedish, from Old Swedish laws to present-day social media. Historical texts offer exciting theoretical and methodological challenges for language technology because they often defy the assumption inherent in most automatic analysis tools that the texts contain a standardized written language. In this article, we describe our ongoing work on the development of annotated historical corpora, as well as our efforts on linking various resources (both corpora and lexical resources). This research advances the state of the art of language technology as well as enables new research for scholars in other disciplines.},
}

@inproceedings{forsberg-hulden-2016-learning-240208,
  author    = {Forsberg, Markus and Hulden, Mans},
  title     = {Learning Transducer Models for Morphological Analysis from Example Inflections},
  booktitle = {Proceedings of the SIGFSM Workshop on Statistical NLP and Weighted Automata. Association for Computational Linguistics. August 12, 2016 Berlin, Germany},
  year      = {2016},
  pages     = {42--50},
  publisher = {ACL},
  address   = {Stroudsburg, PA, USA},
  isbn      = {978-1-945626-13-5},
  abstract  = {In this paper, we present a method to convert morphological inflection tables into unweighted and weighted finite transducers that perform parsing and generation. These transducers model the inflectional behavior of morphological paradigms induced from examples and can map inflected forms of previously unseen word forms into their lemmas and give morphosyntactic descriptions of them. The system is evaluated on several languages with data collected from the Wiktionary.},
}

@inproceedings{cap-etal-2016-sword-254388,
  author    = {Cap, Fabienne and Adesam, Yvonne and Ahrenberg, Lars and Borin, Lars and Bouma, Gerlof and Forsberg, Markus and Kann, Viggo and Östling, Robert and Smith, Aaron and Wirén, Mats and Nivre, Joakim},
  title     = {{SWORD}: Towards Cutting-Edge {Swedish} Word Processing},
  booktitle = {Proceedings of the Sixth Swedish Language Technology Conference (SLTC) Umeå University, 17-18 November, 2016},
  year      = {2016},
  abstract  = {Despite many years of research on Swedish language technology, there is still no well-documented standard for Swedish word processing covering the whole spectrum from low-level tokenization to morphological analysis and disambiguation. SWORD is a new initiative within the SWE-CLARIN consortium aiming to develop documented standards for Swedish word processing. In this paper, we report on a pilot study of Swedish tokenization, where we compare the output of six different tokenizers on four different text types. For one text type (Wikipedia articles), we also compare to the tokenization produced by six manual annotators.},
}

@inproceedings{viklund-borin-2016-data-236738,
  author    = {Viklund, Jon and Borin, Lars},
  title     = {How can big data help us study rhetorical history?},
  booktitle = {Linköping Electronic Conference Proceedings, No. 123. Edited by Koenraad De Smedt. Selected Papers from the CLARIN Annual Conference 2015. October 14–16, 2015, Wroclaw, Poland},
  year      = {2016},
  volume    = {123},
  pages     = {79--93},
  isbn      = {978-91-7685-765-6},
  abstract  = {Rhetorical history is traditionally studied through rhetorical treatises or selected rhetorical practices, for example the speeches of major orators. Although valuable sources, these do not give us the answers to all our questions. Indeed, focus on a few canonical works or the major historical key figures might even lead us to reproduce cultural self-identifications and false generalizations. However, thanks to increasing availability of relevant digitized texts, we are now at a point where it is possible to see how new research questions can be formulated – and how old research questions can be addressed from a new angle or established results verified – on the basis of exhaustive collections of data, rather than small samples, but where a methodology has not yet established itself. The aim of this paper is twofold: (1) We wish to demonstrate the usefulness of large-scale corpus studies (“text mining”) in the field of rhetorical history, and hopefully point to some interesting research problems and how they can be analyzed using “big-data” methods. (2) In doing this, we also aim to make a contribution to method development in e-science for the humanities and social sciences, and in particular in the framework of CLARIN.},
}

@inproceedings{ehrlemark-etal-2016-retrieving-242241,
  author    = {Ehrlemark, Anna and Johansson, Richard and Lyngfelt, Benjamin},
  title     = {Retrieving Occurrences of Grammatical Constructions},
  booktitle = {Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics : Technical Papers, December 11–17; Osaka, Japan},
  year      = {2016},
  isbn      = {978-4-87974-702-0},
  abstract  = {Finding authentic examples of grammatical constructions is central in constructionist approaches to linguistics, language processing, and second language learning. In this paper, we address this problem as an information retrieval (IR) task. To facilitate research in this area, we built a benchmark collection by annotating the occurrences of six constructions in a Swedish corpus. Furthermore, we implemented a simple and flexible retrieval system for finding construction occurrences, in which the user specifies a ranking function using lexical-semantic similarities (lexicon-based or distributional). The system was evaluated using standard IR metrics on the new benchmark, and we saw that lexical-semantical rerankers improve significantly over a purely surface-oriented system, but must be carefully tailored for each individual construction.},
}

@inproceedings{gruzitis-etal-2016-grammatical-233921,
  author    = {Gruzitis, Normunds and Dannélls, Dana and Ranta, Aarne and Tyers, Francis M.},
  title     = {{Grammatical Framework} for implementing multilingual frames and constructions},
  booktitle = {Book of Abstracts. The 9th International Conference on Construction Grammar (ICCG9) theme session on Computational Semantics with Frames and Constructions. October 05-09, 2016, Juiz de Fora, Brazil},
  year      = {2016},
}

@inproceedings{themistocleous-etal-2016-effects-239893,
  author    = {Themistocleous, Charalambos and Savva, Angelandria and Aristodemou, Andrie},
  title     = {Effects of stress on fricatives: Evidence from {Standard Modern Greek}},
  booktitle = {17th Annual Conference of the International Speech Communication Association, Interspeech 2016 8-12 Sep 2016, San Francisco, USA},
  year      = {2016},
  isbn      = {978-1-5108-3313-5},
  abstract  = {This study investigates the effects of stress on the spectral properties of fricative noise in Standard Modern Greek (SMG). Twenty female speakers of SMG participated in the study. Fricatives were produced in stressed and unstressed positions in two vowel place positions: back and front vowels. Acoustic measurements were taken and the temporal and spectral properties of fricatives using spectral moments were calculated. Stressed fricatives are produced with increased duration, center of gravity, standard deviation, and normalized intensity. The machine learning and classification algorithm C5.0 has been employed to estimate the contribution of the temporal and spectral parameters for the classification of fricatives. Overall, duration and center of gravity contribute the most to the classification of stressed vs. unstressed fricatives.},
}

@article{themistocleous-2016-bursts-243451,
  author   = {Themistocleous, Charalambos},
  title    = {The bursts of stops can convey dialectal information},
  journal  = {Journal of the Acoustical Society of America},
  year     = {2016},
  volume   = {140},
  number   = {4},
  pages    = {EL334--EL339},
  abstract = {This study investigates the effects of the dialect of the speaker on the spectral properties of stop bursts. Forty-five female speakers—20 Standard Modern Greek and 25 Cypriot Greek speakers—participated in this study. The spectral properties of stop bursts were calculated from the burst spectra and analyzed using spectral moments. The findings show that besides linguistic information, i.e., the place of articulation and the stress, the speech signals of bursts can encode social information, i.e., the dialects. A classification model using decision trees showed that skewness and standard deviation have a major contribution for the classification of bursts across dialects.},
}