Språkbanken Text is a part of Språkbanken.
BibTeX

% Conference paper. The proceedings editors are stored in `editor`, not appended to `booktitle`.
@inProceedings{berdicevskis-etal-2023-superlim-331445,
	title        = {Superlim: A {Swedish} Language Understanding Evaluation Benchmark},
	booktitle    = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, December 6-10, 2023, Singapore},
	editor       = {Bouamor, Houda and Pino, Juan and Bali, Kalika},
	author       = {Berdicevskis, Aleksandrs and Bouma, Gerlof and Kurtz, Robin and Morger, Felix and Öhman, Joey and Adesam, Yvonne and Borin, Lars and Dannélls, Dana and Forsberg, Markus and Isbister, Tim and Lindahl, Anna and Malmsten, Martin and Rekathati, Faton and Sahlgren, Magnus and Volodina, Elena and Börjeson, Love and Hengchen, Simon and Tahmasebi, Nina},
	year         = {2023},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	ISBN         = {979-8-89176-060-8},
	pages        = {8137--8153},
}

% Conference paper. Proper nouns in the title are brace-protected against sentence-casing styles.
@inProceedings{dannells-etal-2024-transformer-338708,
	title        = {Transformer-based {Swedish} Semantic Role Labeling through Transfer Learning},
	abstract     = {Semantic Role Labeling (SRL) is a task in natural language understanding where the goal is to extract semantic roles for a given sentence. English SRL has achieved state-of-the-art performance using Transformer techniques and supervised learning. However, this technique is not a viable choice for smaller languages like Swedish due to the limited amount of training data. In this paper, we present the first effort in building a Transformer-based SRL system for Swedish by exploring multilingual and cross-lingual transfer learning methods and leveraging the Swedish FrameNet resource. We demonstrate that multilingual transfer learning outperforms two different cross-lingual transfer models. We also found some differences between frames in FrameNet that can either hinder or enhance the model’s performance. The resulting end-to-end model is freely available and will be made accessible through Språkbanken Text’s research infrastructure.},
	booktitle    = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), 20-25 May, 2024, Torino, Italia},
	author       = {Dannélls, Dana and Johansson, Richard and Buhr, Lucy Yang},
	year         = {2024},
	publisher    = {ELRA and ICCL},
	address      = {Turin, Italy},
	ISBN         = {978-2-493814-10-4},
}

% Workshop paper. Acronyms and proper nouns in the title are brace-protected; `address` is written as "City, State, Country".
@inProceedings{lofgren-dannells-2024-post-336065,
	title        = {{Post-OCR} Correction of Digitized {Swedish} Newspapers with {ByT5}},
	abstract     = {Many collections of digitized newspapers suffer from poor OCR quality, which impacts readability, information retrieval, and analysis of the material. Errors in OCR output can be reduced by applying machine translation models to translate it into a corrected version. Although transformer models show promising results in post-OCR correction and related tasks in other languages, they have not yet been explored for correcting OCR errors in Swedish texts. This paper presents a post-OCR correction model for Swedish 19th to 21th century newspapers based on the pre-trained transformer model ByT5. Three versions of the model were trained on different mixes of training data. The best model, which achieved a 36\% reduction in CER, is made freely available and will be integrated into the automatic processing pipeline of Språkbanken Text, a Swedish language technology infrastructure containing modern and historical written data.},
	booktitle    = {Proceedings of the 8th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2024), March 22, 2024, Malta},
	author       = {Löfgren, Viktoria and Dannélls, Dana},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
	address      = {East Stroudsburg, PA, USA},
	ISBN         = {979-8-89176-069-1},
}

% Workshop paper. Editor names use the unambiguous "Last, First" form, matching `author`.
@inProceedings{dannells-broden-2020-building-297061,
	title        = {Building a Language Technology Infrastructure for Digital Humanities: Challenges, Opportunities and Progress},
	abstract     = {Språkbanken Text, a research unit at the University of Gothenburg, forms part of the National Language Bank of Sweden and is the main coordinating node of Swe-Clarin, the Swedish national CLARIN node. During the past years, Språkbanken Text has been actively engaged in a number of humanities
and social sciences related research projects. This engagement has primarily concerned the development of new resources, methods and tools to accurately process large amounts of digitized material, in addition to interfaces for visualizing the materials, making them easily accessible for further analysis. The activities within Swe-Clarin have been essential for the progress and the success of this
work. In this paper we present what was required from Språkbanken Text in order to meet the expectations of researchers from the humanities and social sciences. We discuss some of the challenges this work involves and describe the opportunities this field brings with it and how these opportunities could help to progress the work of Språkbanken Text toward building a language technology infrastructure that supports interdisciplinary research.},
	booktitle    = {Proceedings of the Twin Talks 2 and 3 Workshops at DHN 2020 and DH 2020 Ottawa Canada and Riga Latvia, July 23 and October 20, 2020},
	editor       = {Krauwer, Steven and Fišer, Darja},
	author       = {Dannélls, Dana and Brodén, Daniel},
	year         = {2020},
	publisher    = {CEUR-WS.org},
}

% Book chapter. Volume editors are stored in `editor`; `&` in the abstract is escaped for LaTeX.
@incollection{borin-etal-2021-introduction-310200,
	title        = {Introduction: {Swedish FrameNet++}},
	abstract     = {The Swedish FrameNet++ was designed to be several things. As a digital artifact, it is an integrated panchronic lexical macroresource, primarily for Swedish, but including several other languages, intended as a basic infrastructural component in Swedish language technology research and for developing natural language processing applications. As an activity, it is a long-term R\&D initiative,
initially aimed at bringing about this macroresource, and now at maintaining and extending it, at promoting its use in language technology research and application development, as well as ensuring that the results of this research and development in their turn are incorporated in the macroresource. As a product of research, it reflects both computational and linguistic approaches to lexicology,
lexical semantics, and lexical typology.},
	booktitle    = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications},
	editor       = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
	author       = {Borin, Lars and Dannélls, Dana and Friberg Heppin, Karin},
	year         = {2021},
	publisher    = {John Benjamins Publishing Company},
	address      = {Amsterdam / Philadelphia},
	ISBN         = {978-90-272-5848-9},
	pages        = {3--36},
}

% Whole book. Main title and subtitle are joined with a colon instead of a bare line break.
% NOTE(review): other entries in this file list the same three people as editors of this volume; confirm whether `author` should be `editor`.
@book{dannells-etal-2021-swedish-310036,
	title        = {The {Swedish FrameNet++}: Harmonization, integration, method development and practical language technology applications},
	abstract     = {Large computational lexicons are central NLP resources. Swedish FrameNet++ aims to be a versatile full-scale lexical resource for NLP containing many kinds of linguistic information. Although focused on Swedish, this ongoing effort, which includes building a new Swedish framenet and recycling existing lexicons, has offered valuable insights into general aspects of lexical-resource building for NLP, which are discussed in this book: computational and linguistic problems of lexical semantics and lexical typology, the nature of lexical items (words and multiword expressions), achieving interoperability among heterogeneous lexical content, NLP methods for extending and interlinking existing lexicons,
and deploying the new resource in practical NLP applications. This book is targeted at everyone with an interest in lexicography, computational lexicography, lexical typology, lexical semantics, linguistics, computational linguistics and related fields. We believe it should be of particular interest to those who are or have been involved in language resource creation, development and evaluation.},
	author       = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
	year         = {2021},
	publisher    = {John Benjamins Publishing Company},
	address      = {Amsterdam / Philadelphia},
	ISBN         = {9789027209900},
}

% Conference paper. The proceedings editors are stored in `editor`, not appended to `booktitle`.
@inProceedings{virk-etal-2021-data-306964,
	title        = {A Data-Driven Semi-Automatic Framenet Development Methodology},
	abstract     = {FrameNet is a lexical semantic resource based on the linguistic theory of frame semantics. A number of framenet development strategies have been reported previously and all of them involve exploration of corpora and a fair amount of manual work. Despite previous efforts, there does not exist
a well-thought-out automatic/semi-automatic methodology for frame construction. In this paper we propose a data-driven methodology for identification and semi-automatic construction of frames. As a proof of concept, we report on our initial attempts to build a wider-scale framenet for the legal domain (LawFN) using the proposed methodology. The constructed frames are stored in a lexical database
and together with the annotated example sentences they have been made available through a web interface.},
	booktitle    = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021},
	editor       = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
	author       = {Virk, Shafqat and Dannélls, Dana and Borin, Lars and Forsberg, Markus},
	year         = {2021},
	publisher    = {INCOMA},
	address      = {Shoumen, Bulgaria},
	ISBN         = {978-954-452-072-4},
}

% Conference paper. `%` in the abstract is escaped so the field is safe to typeset; acronyms in the title are brace-protected.
@inProceedings{skelbye-dannells-2021-processing-306957,
	title        = {{OCR} Processing of {Swedish} Historical Newspapers Using Deep Hybrid {CNN–LSTM} Networks},
	abstract     = {Deep CNN–LSTM hybrid neural networks have proven to improve the accuracy of Optical Character Recognition (OCR) models for different languages. In this paper we examine to what extent these networks improve the OCR accuracy rates on Swedish historical newspapers. By experimenting
with the open source OCR engine Calamari, we are able to show that mixed deep CNN–LSTM hybrid models outperform previous models on the task of character recognition of Swedish historical newspapers spanning 1818–1848. We achieved an average character accuracy rate (CAR) of 97.43\% which is a new state–of–the–art result on 19th century Swedish newspaper text. Our data, code and
models are released under CC BY licence.},
	booktitle    = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021},
	editor       = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
	author       = {Skelbye, Molly and Dannélls, Dana},
	year         = {2021},
	publisher    = {INCOMA},
	address      = {Shoumen, Bulgaria},
	ISBN         = {978-954-452-072-4},
}

% Workshop paper. "L1-L2" in the title is brace-protected against sentence-casing styles.
@inProceedings{masciolini-etal-2023-towards-329384,
	title        = {Towards automatically extracting morphosyntactical error patterns from {L1-L2} parallel dependency treebanks},
	abstract     = {L1-L2 parallel dependency treebanks are UD-annotated corpora of learner sentences paired with correction hypotheses. Automatic morphosyntactical annotation has the potential to remove the need for explicit manual error tagging and improve interoperability, but makes it more challenging to locate grammatical errors in the resulting datasets. We therefore propose a novel method for automatically extracting morphosyntactical error patterns and perform a preliminary bilingual evaluation of its first implementation through a similar example retrieval task. The resulting pipeline is also available as a prototype CALL application.},
	booktitle    = {Proceedings of the 18th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2023), July 13, 2023, Toronto, Canada},
	author       = {Masciolini, Arianna and Volodina, Elena and Dannélls, Dana},
	year         = {2023},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	ISBN         = {978-1-959429-80-7},
}

% Whole proceedings volume: typed as a proceedings entry, with the listed people recorded as its editors.
@proceedings{ilinykh-etal-2023-proceedings-327035,
	title        = {Proceedings of the Second Workshop on Resources and Representations for Under-Resourced Languages and Domains ({RESOURCEFUL-2023}), {May} 22, 2023, {Tórshavn}, {Faroe Islands}},
	abstract     = {The second workshop on resources and representations for under-resourced language and domains was
held in Tórshavn, Faroe Islands on May 22nd, 2023. The workshop was conducted in a physical setting,
allowing for potential hybrid participation.
Continuing with the aim of the first edition in 2020, RESOURCEFUL explored the role of the kind and
the quality of resources that are available to us, as well as the challenges and directions for constructing
new resources in light of the latest trends in natural language processing. The workshop has provided
a forum for discussions between the two communities involved in building data-driven and
annotation-driven resources.},
	editor       = {Ilinykh, Nikolai and Morger, Felix and Dannélls, Dana and Dobnik, Simon and Megyesi, Beáta and Nivre, Joakim},
	year         = {2023},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	ISBN         = {978-1-959429-73-9},
}

% Conference paper. The proceedings editors are stored in `editor`, not appended to `booktitle`.
@inProceedings{virk-etal-2021-novel-306962,
	title        = {A Novel Machine Learning Based Approach for {Post-OCR} Error Detection},
	abstract     = {Post processing is the most conventional approach for correcting errors that are caused
by Optical Character Recognition (OCR) systems. Two steps are usually taken to correct
OCR errors: detection and corrections. For the first task, supervised machine learning methods have shown state-of-the-art performances. Previously proposed approaches have focused
most prominently on combining lexical, contextual and statistical features for detecting errors. In this study, we report a novel system to error detection which is based merely on the n-gram counts of a candidate token. In addition to being simple and computationally less expensive, our proposed system beats previous systems reported in the ICDAR2019 competition on OCR-error detection with notable margins. We achieved state-of-the-art F1-scores for eight out of the ten involved European languages. The maximum improvement is for Spanish which improved from 0.69 to 0.90, and the minimum for Polish from 0.82 to 0.84.},
	booktitle    = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021},
	editor       = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
	author       = {Virk, Shafqat and Dannélls, Dana and Muhammad, Azam Sheikh},
	year         = {2021},
	publisher    = {INCOMA},
	address      = {Shoumen, Bulgaria},
	ISBN         = {978-954-452-072-4},
}

% Journal article. The page range uses the BibTeX double hyphen; the journal name is given in full title case.
@article{gruzitis-dannells-2017-multilingual-225789,
	title        = {A multilingual {FrameNet-based} grammar and lexicon for controlled natural language},
	abstract     = {Berkeley FrameNet is a lexico-semantic resource for English based on the theory of frame semantics. It has been exploited in a range of natural language processing applications and has inspired the development of framenets for many languages. We present a methodological approach to the extraction and generation of a computational multilingual FrameNet-based grammar and lexicon. The approach leverages FrameNet-annotated corpora to automatically extract a set of cross-lingual semantico-syntactic valence patterns. Based on data from Berkeley FrameNet and Swedish FrameNet, the proposed approach has been implemented in Grammatical Framework (GF), a categorial grammar formalism specialized for multilingual grammars. The implementation of the grammar and lexicon is supported by the design of FrameNet, providing a frame semantic abstraction layer, an interlingual semantic application programming interface (API), over the interlingual syntactic API already provided by GF Resource Grammar Library. The evaluation of the acquired grammar and lexicon shows the feasibility of the approach. Additionally, we illustrate how the FrameNet-based grammar and lexicon are exploited in two distinct multilingual controlled natural language applications. The produced resources are available under an open source license.},
	journal      = {Language Resources and Evaluation},
	author       = {Gruzitis, Normunds and Dannélls, Dana},
	year         = {2017},
	volume       = {51},
	number       = {1},
	pages        = {37--66},
}

% Conference contribution (SLTC 2018). The corpus name in the title is brace-protected.
@inProceedings{adesam-etal-2018-exploring-273835,
	title        = {Exploring the Quality of the Digital Historical Newspaper Archive {KubHist}},
	booktitle    = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November 2018},
	author       = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina},
	year         = {2018},
}

% Festschrift chapter. Volume editors are stored in `editor`, not appended to `booktitle`.
@incollection{dannells-etal-2022-beyond-321730,
	title        = {Beyond strings of characters: Resources meet {NLP} – Again},
	abstract     = {FrameNet (FN) resources have existed for many languages for over a decade but their adoption
in real world applications has been limited. To celebrate the 65 anniversary of Lars Borin, the
initiator and leader of Swedish FrameNet, among others, we take a standpoint to motivate why
language resources are crucial for moving NLP forward. We present our position on (a) the need
for language resources to embrace other dimensions of text and language use, and (b) the need
for them to relate to other representations through multimodality.},
	booktitle    = {Live and learn: Festschrift in honor of Lars Borin},
	editor       = {Volodina, Elena and Dannélls, Dana and Berdicevskis, Aleksandrs and Forsberg, Markus and Virk, Shafqat},
	author       = {Dannélls, Dana and Torrent, Tiago Timponi and Sigiliano, Natalia Sathler and Dobnik, Simon},
	year         = {2022},
	publisher    = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet},
	address      = {Göteborg},
	ISBN         = {978-91-87850-83-7},
	pages        = {29--37},
}

% Edited volume. Uses the standard book type with `editor`; an unknown entry type would fall back to the style's default handling.
@book{volodina-etal-2022-live-320415,
	title        = {Live and Learn: Festschrift in honor of {Lars Borin}},
	abstract     = {This Festschrift has been compiled to honor Professor Lars Borin on his 65th anniversary. It consists of 30 articles which reflect a fraction of Lars’ scholarly interests within computational linguistics and related fields. They come from his friends and colleagues around the world and deal with topics that have been – in one way or another – inspired by his work. A common theme for the articles is the never-ending need to learn, which is alluded to in the title of the volume, Live and Learn.},
	editor       = {Volodina, Elena and Dannélls, Dana and Berdicevskis, Aleksandrs and Forsberg, Markus and Virk, Shafqat},
	year         = {2022},
	publisher    = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet},
	address      = {Göteborg},
	ISBN         = {978-91-87850-83-7},
}

% Conference paper (DHN 2019). Editor names use the "Last, First" form, matching `author`.
@inProceedings{adesam-etal-2019-exploring-279948,
	title        = {Exploring the Quality of the Digital Historical Newspaper Archive {KubHist}},
	abstract     = {The KubHist Corpus is a massive corpus of Swedish historical newspapers, digitized by the Royal Swedish library, and available through the Språkbanken corpus infrastructure Korp. This paper contains a first overview of the KubHist corpus, exploring some of the difficulties with the data, such as OCR errors and spelling variation, and discussing possible paths for improving the quality and the searchability.},
	booktitle    = {Proceedings of the 4th Conference of The Association Digital Humanities in the Nordic Countries (DHN), Copenhagen, Denmark, March 5-8, 2019},
	editor       = {Navarretta, Costanza and Agirrezabal, Manex and Maegaard, Bente},
	author       = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina},
	year         = {2019},
	publisher    = {CEUR Workshop Proceedings},
	address      = {Aachen},
}

% Book chapter. The ISBN is hyphenated and the page range has no padding spaces.
@incollection{dannells-etal-2021-swedish-310041,
	title        = {{Swedish} {FrameNet}},
	abstract     = {This chapter describes the development of Swedish FrameNet. A new framenet project often follows one of two methodological approaches: (1) extension, through translation of a different-language – often English – framenet into the target language, and (2) merging, where the resource is built from scratch in the target language. Both approaches have their pros and cons, which have been
extensively discussed in the literature. Swedish FrameNet is mainly developed through the extension approach, although balanced with the merging approach. Drawing on the two approaches simultaneously, we describe how integrated language resources and tools have been exploited to create and develop Swedish FrameNet: how it was constructed, what it contains, and the basic assumptions underlying the annotation of its contents.},
	booktitle    = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications},
	author       = {Dannélls, Dana and Borin, Lars and Forsberg, Markus and Friberg Heppin, Karin and Toporowska Gronostaj, Maria},
	year         = {2021},
	publisher    = {John Benjamins Publishing Company},
	address      = {Amsterdam / Philadelphia},
	ISBN         = {978-90-272-5848-9},
	pages        = {37--66},
}

% Book chapter. Field values are trimmed; the ISBN and page range follow the form used by the sibling chapters.
@incollection{dannells-grztis-2021-computational-310047,
	title        = {Computational representation of {FrameNet} for multilingual natural language generation},
	abstract     = {Multilingual natural language generation, the process of producing written or spoken utterances in parallel languages from either structured or unstructured representations requires large amounts of syntactic and semantic information to generate an expression that is tailored to the target audience. This information is offered by FrameNet-like resources, which have been developed for a number of languages. In this chapter, we present a computational FrameNet grammar resource for multilingual natural language generation. We compare between English and Swedish framenets to illustrate how these can be unified under a shared computational representation using Grammatical Framework.
We demonstrate how the grammar was exploited in two practical multilingual natural language generation applications to facilitate tourist communication and empower museum users with coherent artwork descriptions.},
	booktitle    = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications},
	author       = {Dannélls, Dana and Grūzītis, Normunds},
	year         = {2021},
	publisher    = {John Benjamins Publishing Company},
	address      = {Amsterdam / Philadelphia},
	ISBN         = {978-90-272-5848-9},
	pages        = {281--301},
}

% Conference paper (SLTC-2020 selected contributions). `%` in the abstract is escaped so the field is safe to typeset.
@inProceedings{dannells-virk-2021-supervised-310123,
	title        = {A Supervised Machine Learning Approach for {Post-OCR} Error Detection for Historical Text},
	abstract     = {Training machine learning models with high accuracy requires careful feature engineering, which involves finding the best feature combinations and extracting their values from the data. The task becomes extremely laborious for specific problems such as post Optical Character Recognition (OCR) error detection because of the diversity of errors in the data. In this paper we present a machine learning approach which exploits character n-gram statistics as the only feature for the OCR error detection task. Our method achieves a significant improvement over the baseline reaching state-of-the-art results of 91\% and 89\% F1 measure on English and Swedish datasets respectively. We report various experiments to select the appropriate machine learning algorithm and to compare our approach to previously reported traditional approaches.},
	booktitle    = {Linköping Electronic Press Workshop and Conference Collection. Selected contributions from the Eighth Swedish Language Technology Conference (SLTC-2020), 25-27 November, 2020},
	author       = {Dannélls, Dana and Virk, Shafqat},
	year         = {2021},
	publisher    = {Linköping Electronic Press},
	address      = {Linköping},
}

% Conference paper (NoDaLiDa 2021). Field values are trimmed; proper nouns in the title are brace-protected.
@inProceedings{hansson-etal-2021-swedish-305126,
	title        = {The {Swedish} {Winogender} Dataset},
	abstract     = {We introduce the SweWinogender test set, a diagnostic dataset to measure gender bias in coreference resolution. It is modelled after the English Winogender benchmark, and is released with reference statistics on the distribution of men and women between occupations and the association between gender and occupation in modern corpus material. The paper discusses the design and creation of the dataset, and presents a small investigation of the supplementary statistics.},
	booktitle    = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa), May 31 - June 2, 2021, Reykjavik, Iceland (online)},
	author       = {Hansson, Saga and Mavromatakis, Konstantinos and Adesam, Yvonne and Bouma, Gerlof and Dannélls, Dana},
	year         = {2021},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-614-8},
}

% Conference paper (CLARIN 2020 selected papers). Doubled spaces in the abstract are collapsed to single spaces.
@inProceedings{dannells-etal-2021-engine-305700,
	title        = {A {Two-OCR} Engine Method for Digitized {Swedish} Newspapers},
	abstract     = {In this paper we present a two-OCR engine method that was developed at Kungliga biblioteket (KB), the National Library of Sweden, for improving the correctness of the OCR for mass digitization of Swedish newspapers. To evaluate the method a reference material spanning the years 1818–2018 was prepared and manually transcribed. A quantitative evaluation was then performed against the material. In this first evaluation we experimented with word lists for different time periods. The results show that even though there was no significant overall improvement of the OCR results, some combinations of word lists are successful for certain periods and should therefore be explored further.},
	booktitle    = {Selected Papers from the CLARIN Annual Conference 2020, Linköping Electronic Conference Proceedings 180},
	author       = {Dannélls, Dana and Björk, Lars and Dirdal, Ove and Johansson, Torsten},
	year         = {2021},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-609-4},
}

% Conference paper (DHN 2020). Editor names use the "Last, First" form, matching `author`.
% NOTE(review): the second author is entered as surname "Persson", given name "Simon"; the export had the two swapped — confirm against the paper.
@inProceedings{dannells-simon-2020-supervised-289944,
	title        = {Supervised {OCR} Post-Correction of Historical {Swedish} Texts: What Role Does the {OCR} System Play?},
	abstract     = {Current approaches for post-correction of OCR errors offer solutions that are tailored to a specific OCR system. This can be problematic if the post-correction method was trained on a specific OCR
system but have to be applied on the result of another system. Whereas OCR post-correction of historical text has received much attention lately, the question of what role does the OCR system play for the post-correction method has not been addressed. In this study we explore a dataset of
400 documents of historical Swedish text which has been OCR processed by three state-of-the-art OCR systems: Abbyy Finereader, Tesseract and Ocropus. We examine the OCR results of each system and present a supervised machine learning post-correction method that tries to approach
the challenges exhibited by each system. We study the performance of our method by using three evaluation tools: PrimA, Språkbanken evaluation tool and Frontiers Toolkit. Based on the evaluation analysis we discuss the impact each of the OCR systems has on the results of the
post-correction method. We report on quantitative and qualitative results showing varying degrees of OCR post-processing complexity that are important to consider when developing an OCR post-correction method.},
	booktitle    = {Proceedings of the Digital Humanities in the Nordic Countries, 5th Conference, Riga, Latvia, October 21-23, 2020},
	editor       = {Reinsone, Sanita and Skadiņa, Inguna and Baklāne, Anda and Daugavietis, Jānis},
	author       = {Dannélls, Dana and Persson, Simon},
	year         = {2020},
	publisher    = {CEUR-WS},
}

% Conference contribution (SLTC 2020). Field values are trimmed; the acronym in the title is brace-protected.
@inProceedings{dannells-virk-2020-error-297714,
	title        = {{OCR} Error Detection on Historical Text Using Uni-Feature and Multi-Feature Based Machine Learning Models},
	abstract     = {Detecting errors that are caused by Optical Character Recognition (OCR) systems is a challenging task that has received much attention over the years. Recent work has explored machine learning methods using hand-crafted feature engineering, which, in addition to the difficulty in identifying the best feature combinations, is often very time and resources expensive. This raises the question: Do we always need many features to achieve better results? This is an open-ended question and its answer might depend on the task at hand. For OCR error detection, we experimented and found that interestingly a uni-feature based system conquered multi-feature based systems on a Swedish data set achieving state-of-the art results, and performed equally well on an English dataset. We also experimented to find which machine learning algorithm is more suitable for the task at hand by comparing the performance of five well-known machine learning algorithms, namely Logistic regression, Decision Trees, Bernoulli Naive Bayes, Naive Bays, and Support Vector Machines.},
	booktitle    = {Swedish Language Technology Conference (SLTC), 25-27 November 2020, University of Gothenburg},
	author       = {Dannélls, Dana and Virk, Shafqat},
	year         = {2020},
}

% Workshop paper (LaTeCH at EACL 2012). The resource name in the title is brace-protected.
@inProceedings{dannells-borin-2012-toward-156502,
	title        = {Toward language independent methodology for generating artwork descriptions – Exploring {FrameNet} information},
	abstract     = {Today museums and other cultural heritage institutions are increasingly storing object descriptions using semantic web domain ontologies. To make this content accessible in a multilingual world, it will need to be conveyed in many languages, a language generation task which is domain specific and language dependent. This paper describes how semantic and syntactic information such as that provided in a framenet can contribute to solving this task. It is argued that the kind of information offered by such lexical resources enhances the output quality of a multilingual language generation application, in particular when generating domain specific content.},
	booktitle    = {EACL 2012 workshop on Language Technology for Cultural Heritage, Social Sciences, and Humanities (LaTeCH)},
	author       = {Dannélls, Dana and Borin, Lars},
	year         = {2012},
}

@inProceedings{dannells-etal-2020-evaluation-296165,
	title        = {Evaluation of a Two-OCR Engine Method: First Results on Digitized Swedish Newspapers Spanning over nearly 200 Years},
	abstract     = {In this paper we present a two-OCR engine method that was developed at Kungliga biblioteket (KB), the National Library of Sweden, for improving the correctness of the OCR for mass digitization of Swedish newspapers. We report the first quantitative evaluation results on a material spanning over nearly 200 years. In this first evaluation phase we experimented with word lists for different time periods. Although there was no significant overall improvement of the OCR results, the evaluation shows that some combinations of word lists are successful for certain periods and should therefore be explored further.},
	booktitle    = {CLARIN Annual Conference 2020, (Virtual Event), 5-7 October, 2020. Book of Abstracts},
	author       = {Dannélls, Dana and Björk, Lars and Dirdal, Ove and Johansson, Torsten},
	year         = {2020},
}

@inProceedings{waldispuhl-etal-2020-material-293332,
	title        = {Material Philology Meets Digital Onomastic Lexicography: The NordiCon Database of Medieval Nordic Personal Names in Continental Sources},
	abstract     = {We present NordiCon, a database containing medieval Nordic personal names attested in Continental sources. The database combines formally interpreted and richly interlinked onomastic data with digitized versions of the medieval manuscripts from which the data originate and information on the tokens' context. The structure of NordiCon is inspired by other online historical given name dictionaries. It takes up challenges reported on in previous works, such as how to cover material properties of a name token and how to define lemmatization principles, and elaborates on possible solutions. The lemmatization principles for NordiCon are further developed in order to facilitate the connection to other name dictionaries and corpuses, and the integration of the database into Språkbanken Text, an infrastructure containing modern and historical written data.},
	booktitle    = {Proceedings of The 12th Language Resources and Evaluation Conference, Marseille, 11–16 May 2020 / editors: Nicoletta Calzolari... [et al.]},
	author       = {Waldispühl, Michelle and Dannélls, Dana and Borin, Lars},
	year         = {2020},
	publisher    = {European Language Resources Association},
	address      = {Marseille},
	ISBN         = {979-10-95546-34-4},
}

@inProceedings{dannells-etal-2019-evaluation-278761,
	title        = {Evaluation and refinement of an enhanced OCR process for mass digitisation},
	abstract     = {Great expectations are placed on the capacity of heritage institutions to make their collections available in digital format. Data-driven research is becoming a key concept within the humanities and social sciences. Kungliga biblioteket’s (National Library of Sweden, KB) collections of digitised newspaper can thus be regarded as unique cultural data sets with information that rarely is conveyed in other media types. The digital format makes it possible to explore these resources in ways not feasible while in printed form. As texts are no longer only read but also subjected to computer based analysis the demand on the correct rendering of the original text increases. OCR technologies for converting images to machine-readable text play a fundamental part in making these resources available, but the effectiveness vary with the type of document being processed. This is evident in relation to the digitisation of newspapers where factors relating to their production, layout and paper quality often impair the OCR production. In order to improve the machine readable text, especially in relation to the digitisation of newspapers, KB initiated the development of an OCR-module where key parameters can be adjusted according to the characteristics of the material being processed. The purpose of this paper is to present the project goals and methods.},
	booktitle    = {Proceedings of the Digital Humanities in the Nordic Countries 4th Conference (DHN 2019), Copenhagen, Denmark, March 5-8, 2019. Edited by: Costanza Navarretta, Manex Agirrezabal, Bente Maegaard},
	author       = {Dannélls, Dana and Johansson, Torsten and Björk, Lars},
	year         = {2019},
	publisher    = {University of Copenhagen, Faculty of Humanities},
	address      = {Copenhagen},
}

@incollection{borin-etal-2018-linguistics-269084,
  author    = {Borin, Lars and Dannélls, Dana and Gruzitis, Normunds},
  editor    = {Benjamin Lyngfelt and Lars Borin and Kyoko Ohara and Tiago Timponi Torrent},
  title     = {Linguistics vs. language technology in constructicon building and use},
  booktitle = {Constructicography: Constructicon development across languages},
  publisher = {John Benjamins},
  address   = {Amsterdam},
  year      = {2018},
  pages     = {229--253},
  isbn      = {9789027263865},
  abstract  = {In this chapter, we describe the close interaction of linguists and language technologists in the Swedish constructicon project. This kind of collaboration is not so common today, because of the way that language technology has developed in recent decades, but in our case the collaboration has been very successful, and constituted a genuine instance of cross-fertilization, where an evolving language technology infrastructure and a computational lexical macroresource described in the chapter has formed an integral part of the Swedish constructicon development environment, while at the same time the structured linguistic knowledge described in the constructicon has informed the language technology making up the infrastructure.},
}

@inProceedings{dannells-olsson-2018-integrating-271181,
	title        = {Integrating language resources in two OCR engines to improve processing of historical Swedish text},
	abstract     = {We are aiming to address the difficulties that many History and Social Sciences researchers struggle with to bring in non-digitized text into language analysis workflows. In this paper we present the language resources and material we used for training two Optical Character Recognition engines for processing historical Swedish text written in Fraktur (blackletter). The trained models, resources and dictionaries are freely available and accessible through our web service, hosted at Språkbanken, to enable users and developers easy access for extraction of historical Swedish texts that are only available in images for further processing.},
	booktitle    = {CLARIN Annual Conference},
	author       = {Dannélls, Dana and Olsson, Leif-Jöran},
	year         = {2018},
}

@inProceedings{dannells-gruzitis-2014-extracting-198499,
	title        = {Extracting a bilingual semantic grammar from FrameNet-annotated corpora},
	abstract     = {We present the creation of an English-Swedish FrameNet-based grammar in Grammatical Framework. The aim of this research is to make existing framenets computationally accessible for multilingual natural language applications via a common semantic grammar API, and to facilitate the porting of such grammar to other languages. In this paper, we describe the abstract syntax of the semantic grammar while focusing on its automatic extraction possibilities. We have extracted a shared abstract syntax from $\sim$58,500 annotated sentences in Berkeley FrameNet (BFN) and $\sim$3,500 annotated sentences in Swedish FrameNet (SweFN). The abstract syntax defines 769 frame-specific valence patterns that cover 77,8\% examples in BFN and 74,9\% in SweFN belonging to the shared set of 471 frames. As a side result, we provide a unified method for comparing semantic and syntactic valence patterns across framenets.},
	booktitle    = {Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC)},
	author       = {Dannélls, Dana and Gruzitis, Normunds},
	year         = {2014},
	publisher    = {European Language Resources Association},
	ISBN         = {978-2-9517408-8-4},
}

@inProceedings{gruzitis-etal-2016-grammatical-233921,
	title        = {Grammatical Framework for implementing multilingual frames and constructions},
	booktitle    = {Book of Abstracts. The 9th International Conference on Construction Grammar (ICCG9) theme session on Computational Semantics with Frames and Constructions. October 05-09, 2016, Juiz de Fora, Brazil},
	author       = {Gruzitis, Normunds and Dannélls, Dana and Ranta, Aarne and Tyers, Francis M.},
	year         = {2016},
}

@techreport{borin-etal-2016-free-233768,
	title        = {A free cloud service for OCR / En fri molntjänst för OCR},
	author       = {Borin, Lars and Bouma, Gerlof and Dannélls, Dana},
	year         = {2016},
	institution  = {University of Gothenburg},
	publisher    = {University of Gothenburg},
	address      = {Göteborg},
}

@inProceedings{gruzitis-etal-2015-formalising-220419,
	title        = {Formalising the Swedish Constructicon in Grammatical Framework},
	abstract     = {This paper presents a semi-automatic approach to acquire a computational construction grammar from the semi-formal Swedish Constructicon. The implementation is based on the resource grammar library provided by Grammatical Framework and can be seen as an extension to the existing Swedish resource grammar. An important consequence of this work is that it generates feedback, explicit and implicit, on how to improve the annotation consistency and adequacy of the original construction resource.},
	booktitle    = {Proceedings of the Grammar Engineering Across Frameworks (GEAF) Workshop, 53rd Annual Meeting of the ACL and 7th IJCNLP, Beijing, China, July 26-31, 2015},
	author       = {Gruzitis, Normunds and Dannélls, Dana and Lyngfelt, Benjamin and Ranta, Aarne},
	year         = {2015},
	ISBN         = {978-1-932432-66-4},
	pages        = {49--56},
}

@inProceedings{fribergheppin-dannells-2015-polysemy-218276,
	title        = {Polysemy and questions of lumping or splitting in the construction of Swedish FrameNet},
	abstract     = {When working on a lexical resource, such as Swedish FrameNet (SweFN), assumptions based on linguistic theories are made, and methodological directions based upon them are taken. These directions often need to be revised when not beforehand foreseen problems arise. One assumption that was made already in the early development stages of SweFN was that each lexical entry from the reference lexicon, SALDO, would evoke only one semantic frame in SweFN. If a lexical entry evoked more than one frame, it entailed more than one sense and therefore required a new entry in the lexicon. As work progressed, this inclination towards splitting, in the perpetual lumpers and splitters discussion proved to be progressively untenable. This paper will give an account of the problems which were encountered and suggestions for solutions on polysemy issues forcing a discussion on lumping or splitting.},
	booktitle    = {Proceedings of the Workshop on Semantic resources and Semantic Annotation for Natural Language Processing and the Digital Humanities at NODALIDA 2015, Vilnius, 11th May, 2015},
	author       = {Friberg Heppin, Karin and Dannélls, Dana},
	year         = {2015},
	pages        = {12--20},
}

@inproceedings{ahlberg-etal-2014-swedish-210083,
  author    = {Ahlberg, Malin and Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Friberg Heppin, Karin and Johansson, Richard and Kokkinakis, Dimitrios and Olsson, Leif-Jöran and Uppström, Jonatan},
  title     = {Swedish FrameNet++ The Beginning of the End and the End of the Beginning},
  booktitle = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014},
  year      = {2014},
}

@article{borin-etal-2014-geographic-198286,
	title        = {Geographic visualization of place names in Swedish literary texts},
	abstract     = {This article describes the development of a geographical information system (GIS) at Språkbanken as part of a visualization solution to be used in an archive of historical Swedish literary texts. The research problems we are aiming to address concern orthographic and morphological variation, missing place names, and missing place name coordinates. Some of these problems form a central part in the development of methods and tools for the automatic analysis of historical Swedish literary texts at our research unit. We discuss the advantages and challenges of covering large-scale spelling variation in place names from different sources and in generating maps with focus on different time periods.},
	journal      = {Literary \& Linguistic Computing},
	author       = {Borin, Lars and Dannélls, Dana and Olsson, Leif-Jöran},
	year         = {2014},
	volume       = {29},
	number       = {3},
	pages        = {400--404},
}

@incollection{damova-etal-2014-natural-178094,
	title        = {Natural Language Interaction with Semantic Web Knowledge Bases and Linked Open Data},
	abstract     = {Cultural heritage appears to be a very useful use case for Semantic Web technologies. The domain provides with plenty of circumstances where linkages between different knowledge sources are required to ensure access to rich information and respond to the needs of professionals dealing with cultural heritage content. Semantic Web technologies offer the technological backbone to meet the requirement of integrating heterogeneous data easily, but they are still more adapted to be consumed by computers than by humans, especially non-engineers or developers. This chapter is about a technique which allows interaction in natural language with semantic knowledge bases. The proposed technique offers a method that allows querying a semantic repository in natural language and obtaining results from it as a coherent text. This unique solution includes several steps of transition from natural language to SPARQL and from RDF to coherent multilingual descriptions, using the Grammatical Framework, GF. The approach builds on a semantic knowledge infrastructure in RDF, it is based on OWLIM-SE and the data integration method Reason-able View supplied with an ontological reference layer. The latter is connected via formal rules with abstract representations derived from the syntactic trees of natural language input using the GF resource grammar library.},
	booktitle    = {Towards multilingual Semantic Web},
	author       = {Damova, Mariana and Dannélls, Dana and Mateva, Maria and Enache, Ramona and Ranta, Aarne},
	year         = {2014},
	publisher    = {Springer},
	address      = {Berlin},
	ISBN         = {978-3-662-43585-4},
	pages        = {211--226},
}

@inProceedings{dannells-etal-2014-multilingual-204733,
	title        = {A Multilingual SPARQL-Based Retrieval Interface for Cultural Heritage Objects},
	booktitle    = {Proceedings of the ISWC 2014 Posters \& Demonstrations Track a track within the 13th International Semantic Web Conference (ISWC 2014)},
	author       = {Dannélls, Dana and Enache, Ramona and Damova, Mariana},
	year         = {2014},
	volume       = {1272},
	pages        = {205--208},
}

@inProceedings{borin-etal-2014-representing-204731,
	title        = {Representing Swedish Lexical Resources in RDF with lemon},
	abstract     = {The paper presents an ongoing project which aims to publish Swedish lexical-semantic resources using Semantic Web and Linked Data technologies. In this article, we highlight the practical conversion methods and challenges of converting three of the Swedish language resources in RDF with lemon.},
	booktitle    = {Proceedings of the ISWC 2014 Posters \& Demonstrations Track a track within the 13th International Semantic Web Conference (ISWC 2014)},
	author       = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and McCrae, John P.},
	year         = {2014},
	volume       = {1272},
	pages        = {329--332},
}

@inProceedings{dannells-etal-2014-using-201951,
	title        = {Using language technology resources and tools to construct Swedish FrameNet},
	abstract     = {Having access to large lexical and grammatical resources when creating a new language resource is essential for its enhancement and enrichment. This paper describes the interplay and interactive utilization of different language technology tools and resources, in particular the Swedish lexicon SALDO and Swedish Constructicon, in the creation of Swedish FrameNet. We show how integrating resources in a larger infrastructure is much more than the sum of the parts.},
	booktitle    = {Proceedings of the Workshop on Lexical and Grammatical Resources for Language Processing, Dublin Ireland, August 24, 2014},
	author       = {Dannélls, Dana and Friberg Heppin, Karin and Ehrlemark, Anna},
	year         = {2014},
	ISBN         = {978-1-873769-44-7},
	pages        = {8--17},
}

@inproceedings{dannells-gruzitis-2014-controlled-201944,
  author    = {Dannélls, Dana and Gruzitis, Normunds},
  title     = {Controlled Natural Language Generation from a Multilingual FrameNet-based Grammar},
  booktitle = {Lecture Notes in Computer Science},
  volume    = {8625},
  year      = {2014},
  pages     = {155--166},
  isbn      = {978-3-319-10222-1},
  abstract  = {This paper presents a currently bilingual but potentially multilingual FrameNet-based grammar library implemented in Grammatical Framework. The contribution of this paper is two-fold. First, it offers a methodological approach to automatically generate the grammar based on semantico-syntactic valence patterns extracted from FrameNet-annotated corpora. Second, it provides a proof of concept for two use cases illustrating how the acquired multilingual grammar can be exploited in different CNL applications in the domains of arts and tourism.},
}

@inProceedings{dannells-camilleri-2010-verb-119938,
	title        = {Verb Morphology of Hebrew and Maltese - Towards an Open Source Type Theoretical Resource Grammar in GF},
	abstract     = {One of the first issues that a programmer must tackle when writing a complete computer program that processes natural language is how to design the morphological component. A typical morphological component should cover three main aspects in a given language: (1) the lexicon, i.e. how morphemes are encoded, (2) orthographic changes, and (3) morphotactic variations. This is in particular challenging when dealing with Semitic languages because of their non-concatenative morphology called root-and pattern morphology. In this paper we describe the design of two morphological components for Hebrew and Maltese verbs in the context of the Grammatical Framework (GF). The components are implemented as a part of larger grammars and are currently under development. We found that although Hebrew and Maltese share some common characteristics in their morphology, it seems difficult to generalize morphosyntactic rules across Semitic verbs when the focus is towards computational linguistics motivated lexicons. We describe and compare the verb morphology of Hebrew and Maltese and motivate our implementation efforts towards a complete open source type theoretical resource grammars for Semitic languages. Future work will focus on semantic aspects of morphological processing.},
	booktitle    = {Proceedings of LREC 2010. Workshop on Language Resources (LRs) and Human Language Technologies (HLT) for Semitic Languages Status, Updates, and Prospects.},
	author       = {Dannélls, Dana and Camilleri, John J.},
	year         = {2010},
}

@misc{dannells-etal-2013-grammar-189699,
	title        = {Grammar-ontology interoperability -- Final Work and Overview},
	abstract     = {D4.3A is an annex to the D4.3 deliverable of WP4 of the MOLTO project. It aims to address the reviewers’ remarks and recommendations for D4.3, as well as to present a final overview of the prototypes built in the scope of MOLTO with respect to grammar-ontology interoperability. D4.3A also describes the work after M24 and gives a general overview of the achievements in MOLTO with focus on WP4 - Knowledge Engineering, WP7 - Patents use case, and WP8 - Cultural Heritage use case.},
	author       = {Dannélls, Dana and Ranta, Aarne and Enache, Ramona and Listenmaa, Inari and Tolosi, Laura and Mateva, Maria},
	year         = {2013},
	publisher    = {University of Gothenburg},
	address      = {Göteborg},
}

@inProceedings{caprotti-etal-2012-high-178183,
	title        = {High-quality translation: Molto tools and applications},
	abstract     = {MOLTO (Multilingual On Line Translation, FP7-ICT-247914, www.molto-project.eu) is a European project focusing on translation on the web. MOLTO targets translation that has production quality, that is, usable for quick and reliable dissemination of information. MOLTO’s main focus is to increase the productivity of such translation systems, building on the technology of GF (Grammatical Framework) and its Resource Grammar Library. But MOLTO also develops hybrid methods which increase the quality of Statistical Machine Translation (SMT) by adding linguistic information, or bootstrap grammatical models from statistical models. This paper gives a brief overview of MOLTO’s latest achievements, many of which are more thoroughly described in separate papers and available as web-based demos and as open-source software.},
	booktitle    = {The fourth Swedish Language Technology Conference (SLTC)},
	author       = {Caprotti, Olga and Ranta, Aarne and Angelov, Krasimir and Enache, Ramona and Camilleri, John J. and Dannélls, Dana and Détrez, Grégoire and Hallgren, Thomas and Prasad, K. V. S. and Virk, Shafqat},
	year         = {2012},
}

@misc{dannells-etal-2013-translation-189698,
	title        = {Translation and retrieval system for museum object descriptions},
	abstract     = {This is the final report of Workpackage 8: Case Study: Cultural Heritage. The major contributions reported are ontology-based multilingual grammar covering 15 languages and cross-language retrieval system for museum object descriptions using Semantic Web technology. Our groundwork for this deliverable was laid in D8.1: Ontology and corpus study of the cultural heritage domain, and D8.2: Multilingual grammar for museum object descriptions.},
	author       = {Dannélls, Dana and Ranta, Aarne and Enache, Ramona and Damova, Mariana and Mateva, Maria},
	year         = {2013},
	publisher    = {University of Gothenburg},
	address      = {Göteborg},
}

@techreport{dannells-2010-mapserver-179443,
	title        = {MapServer at Språkbanken},
	author       = {Dannélls, Dana},
	year         = {2010},
	institution  = {University of Gothenburg},
	publisher    = {University of Gothenburg},
	address      = {Göteborg},
}

@inProceedings{dannells-etal-2013-multilingual-178096,
	title        = {Multilingual access to cultural heritage content on the Semantic Web},
	abstract     = {As the amount of cultural data available on the Semantic Web is expanding, the demand of accessing this data in multiple languages is increasing. Previous work on multilingual access to cultural heritage information has shown that mapping from ontologies to natural language requires at least two different steps: (1) mapping multilingual metadata to interoperable knowledge sources; (2) assigning multilingual knowledge to cultural data. This paper presents our work on making cultural heritage content available on the Semantic Web and accessible in 15 languages. The objective of our work is both to form queries and to retrieve semantic content in multiple languages. We describe our experiences with processing museum data extracted from two different sources, harmonizing this data and making its content accessible in natural language.},
	booktitle    = {Language Technology for Cultural Heritage, Social Sciences, and Humanities (LaTeCH)},
	author       = {Dannélls, Dana and Ranta, Aarne and Enache, Ramona and Damova, Mariana and Mateva, Maria},
	year         = {2013},
}

@inProceedings{dannells-etal-2013-mapserver-178095,
	title        = {MapServer for Swedish Language Technology},
	abstract     = {The MapServer application used by the Swedish Language Bank provides new opportunities for visualizing geographical information found in its large repository of written texts, in particular literary texts. The application is capable of performing coordinate search on the basis of recognized place names and rendering both static and dynamic maps that display their geographical locations.},
	booktitle    = {Digital Humanities},
	author       = {Dannélls, Dana and Borin, Lars and Olsson, Leif-Jöran},
	year         = {2013},
}

@inProceedings{dannells-2012-generating-156504,
	title        = {On generating coherent multilingual descriptions of museum objects from Semantic Web ontologies},
	abstract     = {During the last decade, there has been a shift from developing natural language systems to developing generic systems that are capable of producing natural language descriptions directly from Web ontologies. To make these descriptions coherent and accessible in different languages, a methodology is needed for identifying the general principles that would determine the distribution of referential forms. Previous work has proved through crosslinguistic investigations that strategies for building co-reference are language dependent. However, to our knowledge, there is no language generation methodology that makes a distinction between languages about the generation of referential chains. To determine the principles governing referential chains, we gathered data from three languages: English, Swedish and Hebrew, and studied how co-reference is expressed in a discourse. As a result of the study, a set of language specific co-reference strategies were identified. Using these strategies, an ontology based multilingual grammar for generating written natural language descriptions about paintings was implemented in the Grammatical Framework. A preliminary evaluation of our method shows language-dependent coreference strategies lead to better generation results.},
	booktitle    = {The 7th International Conference on Natural Language Generation (INLG 2012)},
	author       = {Dannélls, Dana},
	year         = {2012},
}

@book{dannells-2012-multilingual-178092,
	title        = {Multilingual text generation from structured formal representations},
	abstract     = {This thesis aims to identify the optimal ways in which natural language generation techniques can be brought to bear upon the problem of processing a structured body of information in order to devise a coherent presentation of text content in multiple languages. We investigate how chains of referential expressions are realized in English, Swedish and Hebrew, and suggest several coreference strategies that can be used to generate coherent descriptions about paintings. The suggested strategies focus on the need to produce paragraph-sized written natural language descriptions from formal structured representations presented in the Semantic Web. We account for principles of coreference by introducing a new modularized approach to automatically generate chains of referential expressions from ontologies. We demonstrate the feasibility of the approach by implementing a system where a Semantic Web domain ontology serves as the background knowledge representation and where the language-specific coreference strategies are incorporated. The system uses both the principles of discourse structures and coreference strategies to guide the generation process. We show how the system successfully generates coherent, well-formed descriptions in multiple languages.},
	author       = {Dannélls, Dana},
	year         = {2012},
	publisher    = {University of Gothenburg},
	address      = {Göteborg},
	ISBN         = {978-91-87850-48-6},
}

@inProceedings{dannells-etal-2011-framework-145395,
	title        = {A Framework for Improved Access to Museum Databases in the Semantic Web},
	abstract     = {Digital museum databases have extremely heterogeneous data structures which require advanced mapping and vocabulary integration for them to benefit from the interoperability enabled by semantic technologies. In addition to establishing ways of extracting and manipulating digitally encoded cultural material, there exists a need to make this material available and accessible to human users in different forms and languages that are available to them. In this paper we describe a method to manage and access museum data by integrating it within a series of interlinked ontological models. The method allows querying and generation of query results in natural language. We report on the results of applying this method from experiments we have been pursuing.},
	booktitle    = {Language Technologies for Digital Humanities and Cultural Heritage (RANLPDigHum 2011)},
	author       = {Dannélls, Dana and Damova, Mariana and Enache, Ramona and Chechev, Milen},
	year         = {2011},
	ISBN         = {978-954-452-019-9},
	pages        = {3--10},
}

@inProceedings{dannells-damova-2011-reason-145391,
	title        = {Reason-able View of Linked Data for Cultural Heritage},
	abstract     = {This paper presents a novel approach that relies on the innovative idea of Reason-able View of the Web of linked data applied to the domain of cultural heritage. We describe an application of data integration based on Semantic Web technologies and the methods necessary to create an integrated semantic knowledge base composed of real museum data that are interlinked with data from the Linked Open Data (LOD) cloud. Thus, creating an infrastructure to allow for easy extension of the domain specific data, and convenient querying of multiple datasets. Our approach is based on a model of schema level and an instance level alignment. The models use several ontologies, e.g. PROTON and CIDOC-CRM, showing their integration by using real data from the Gothenburg City Museum.},
	booktitle    = {Advances in Intelligent and Soft Computing / The Third International Conference on Software, Services \& Semantic Technologies (S3T)},
	author       = {Dannélls, Dana and Damova, Mariana},
	year         = {2011},
	volume       = {101},
	pages        = {17--24},
}

@article{dannells-2010-discourse-110876,
	title        = {Discourse Generation from Formal Specifications Using the Grammatical Framework, GF},
	abstract     = {Semantic web ontologies contain structured information that do not have discourse structure embedded in them. Hence, it becomes increasingly hard to devise multilingual texts that humans comprehend. In this paper we show how to generate coherent multilingual texts from formal representations using discourse strategies. We demonstrate how discourse structures are mapped to GF’s abstract grammar specifications from which multilingual descriptions of work of art objects are generated automatically.},
	journal      = {Special issue of the journal Research in Computing Science (RCS)},
	author       = {Dannélls, Dana},
	year         = {2010},
	volume       = {46},
	pages        = {167--178},
}

@inProceedings{dannells-2010-applying-121404,
	title        = {Applying semantic frame theory to automate natural language templates generation from ontology statements},
	abstract     = {Today there exist a growing number of framenet-like resources offering semantic and syntactic phrase specifications that can be exploited by natural language generation systems. In this paper we present on-going work that provides a starting point for exploiting framenet information for multilingual natural language generation. We describe the kind of information offered by modern computational lexical resources and discuss how template-based generation systems can benefit from them.},
	booktitle    = {The 6th International Natural Language Generation Conference},
	author       = {Dannélls, Dana},
	year         = {2010},
}

@inProceedings{dannells-etal-2012-multilingual-156501,
	title        = {Multilingual Online Generation from {Semantic Web} Ontologies},
	abstract     = {In this paper we report on our ongoing work in the EU project Multilingual Online Translation (MOLTO), supported by the European Union Seventh Framework
Programme under grant agreement FP7-ICT-247914. More specifically, we present work workpackage 8 (WP8): Case
Study: Cultural Heritage. The objective of the work is to build an ontology-based multilingual application for museum
information on the Web. Our approach relies on the innovative idea of Reason-able View of the Web of linked data applied to the
domain of cultural heritage. We have been developing a Web application that uses Semantic Web ontologies for generating coherent multilingual natural language descriptions about museum objects.
We have been experimenting with museum data to test our approach and find that it performs well for the examined languages.},
	booktitle    = {The World Wide Web Conference (WWW2012), 16th-20th April 2012},
	author       = {Dannélls, Dana and Enache, Ramona and Damova, Mariana and Chechev, Milen},
	year         = {2012},
	pages        = {239--242},
}

@inProceedings{dannells-2006-automatic-66478,
	title        = {Automatic Acronym Recognition},
	abstract     = {This paper deals with the problem
of recognizing and extracting acronym-definition pairs in Swedish medical texts.
This project applies a rule-based method to solve the acronym recognition task and compares and evaluates the results of different machine learning algorithms on the same task. The method proposed is based on the approach that acronym-definition pairs follow a set of patterns and other regularities that can be usefully applied for the acronym identification task. Supervised machine learning was applied to monitor the performance of the rule-based method, using Memory Based Learning (MBL). The rule-based algorithm was evaluated on a hand tagged acronym corpus and performance was measured using standard measures recall, precision and f-score.
The results show that performance
could further improve by increasing the training set and modifying the input settings for the machine learning algorithms. An analysis of the errors produced indicates that further improvement of the rule-based method requires the use of syntactic information and textual pre-processing.},
	booktitle    = {Proceedings of the 11th Conference of the European Chapter of the Association for Computational Linguistics (EACL)},
	author       = {Dannélls, Dana},
	year         = {2006},
	ISBN         = {1-932432-59-0},
}

@inProceedings{dannells-2008-production-73693,
	title        = {The production of documents from ontologies},
	abstract     = {The production of documents from an ontology is a challenging
task which requires a significant effort from a natural language
generator. Addressing this problem involves a careful examination
of how the knowledge formalized in an ontology can be verbalized
and realized. We have started to exploit the abilities of generating
natural language texts from a Web Ontology Language (OWL)
and to examine how the content of the ontology can be rendered in
natural language texts that support reader and listener preferences. In
this paper we present our line of research and exemplify some of the
difficulties we encountered while attempting to generate fragments
of texts from a domain specific ontology.},
	booktitle    = {Proceedings of the 18th European Conference on Artificial Intelligence (ECAI). Workshop on Contexts and Ontologies, Patras, Greece},
	author       = {Dannélls, Dana},
	year         = {2008},
	pages        = {36--38},
}

@inProceedings{borin-etal-2010-past-110368,
	title        = {The past meets the present in {Swedish} {FrameNet++}},
	abstract     = {The paper is about a recently initiated project which aims at the development of a Swedish FrameNet as an integral part of a larger lexical resource, hence the name “Swedish FrameNet++” (SweFN++). It focuses on reuse of free electronic resources and their role in the acquisition and population of Swedish frames. After a brief overview of Swedish resources, we reflect on three approaches to recycling the available lexical data in a semi-automatic manner. SweFN++ will be a multi-functional resource supporting research within lexicology and linguistics as well as different applications within computational lexicography and language technology, not to mention e-science.},
	booktitle    = {14th EURALEX International Congress},
	author       = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios},
	year         = {2010},
	pages        = {269--281},
}

@inProceedings{borin-etal-2010-swedish-129126,
	title        = {{Swedish} {FrameNet++}},
	booktitle    = {Swedish Language Technology Conference 2010},
	author       = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios},
	year         = {2010},
}

@inProceedings{borin-etal-2009-thinking-110343,
	title        = {Thinking Green: Toward {Swedish} {FrameNet++}},
	abstract     = {Access to multi-layered lexical, grammatical and semantic information representing
text content is a prerequisite for efficient automatic understanding
and generation of natural language. A FrameNet is considered a valuable
resource for both linguistics and language technology research that
may contribute to the achievement of these goals.
Currently, FrameNet-like resources exist for a few languages, including
some domain-specific and multilingual initiatives (Dolbey et al., 2006;
Boas, 2009; Uematsu et al., 2009; Venturi et al., 2009), but are unavailable
for most languages, including Swedish, although there have been some
pilot studies exploring the semi-automatic acquisition of Swedish frames
(Johansson \& Nugues, 2006; Borin et al., 2007).
At the University of Gothenburg, we are now embarking on a project to
build a Swedish FrameNet-like resource. A novel feature of this project is
that the Swedish FrameNet will be an integral part of a larger many-faceted
lexical resource. Hence the name Swedish FrameNet++ (SweFN++).
},
	booktitle    = {FrameNet Masterclass and Workshop},
	author       = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios},
	year         = {2009},
}

@inProceedings{dannells-2009-improving-104122,
	title        = {Improving Information Access to Cultural Content through Discourse Strategies},
	abstract     = {This paper describes a grammar driven approach for generating multilingual cultural heritage information of objects held by museums and galleries. Discourse strategies are utilized to select and organize ontological statements. The discourse structure is translated to abstract grammar specifications that are mapped to natural language.},
	booktitle    = {Workshop proceedings of the eleventh International Conference of the Italian Association for Artificial Intelligence (AI*IA)},
	author       = {Dannélls, Dana},
	year         = {2009},
	ISBN         = {978-88-903581-1-1},
}

@inproceedings{dannells-2008-system-73695,
  author    = {Dannélls, Dana},
  title     = {A System Architecture for Conveying Historical Knowledge to Museum Visitors},
  booktitle = {Proceedings of ECDL 2008, Århus, Danmark},
  year      = {2008},
  isbn      = {978-90-813489-1-1},
  abstract  = {One of the requirements posed by cultural organizations is
how to accommodate cultural content of formal ontology object descriptions
to different user needs. This paper introduces a personal museum
guide system architecture that is being developed to exploit linguistic aspects
of realization of a domain-specific ontology in relation to the user’s
interaction with this ontology.},
}

@inProceedings{dannells-2009-value-95195,
	title        = {The Value of Weights in Automatically Generated Text Structures},
	abstract     = {One question that arises if we want to evolve generation techniques
to accommodate Web ontologies is how to capture and expose the
relevant ontology content to the user. This paper presents an attempt
to answer the question about how to select the ontology statements that
are significant for the user and present those statements in a way that
helps the user to learn. Our generation approach combines bottom-up
and top-down techniques with enhanced comparison methods to tailor
descriptions about a concept described in an ontology. A preliminary
evaluation indicates that the process of computing preferable property
weights in addition to enhanced generation methods has a positive effect
on the text structure and its content. Future work aims to assign
grammar rules and lexical entries in order to produce coherent texts that
follow on from the generated text structures in several languages.},
	booktitle    = {Proceedings of the 10th International Conference on Intelligent Text Processing and Computational Linguistics},
	author       = {Dannélls, Dana},
	year         = {2009},
	series       = {Lecture Notes in Computer Science},
	volume       = {5449},
	pages        = {233--244},
}

@inProceedings{dannells-2008-generating-73692,
	title        = {Generating Tailored Texts for Museum Exhibits},
	abstract     = {This paper reports work that aims to generate texts in multiple languages from ontologies following the Conceptual Reference Model
(CRM) ISO standard for conceptual models of museums. The rationale of this work is to increase users’ knowledge and interest in
the cultural heritage domain by allowing the user to select his preferable syntax presentation and influence the order of the generated
information using generation techniques and Semantic Web technologies. We chose for study a small amount of logical relations represented
in the ontology and wrote a grammar that is capable to describe them in natural language through user editing. We present the
multilingual source authoring environment, which is built upon the grammatical framework (GF) formalism and show how it is utilized
to generate multiple texts from the CRM domain ontology. The initial results comprise texts, which vary in syntax and content.},
	booktitle    = {Proceedings of the 6th edition of LREC 2008, Workshop on Language Technology for Cultural Heritage Data (LaTeCH), Marrakech, Morocco},
	author       = {Dannélls, Dana},
	year         = {2008},
	pages        = {17--20},
}

@inProceedings{dannells-deleger-2007-multilingual-66462,
	title        = {Multilingual generation of medical information},
	abstract     = {Multilingual generation systems aim to produce understandable texts in multiple languages from one knowledge representation. We adapted an existing
prototype multilingual generator that presents simulated breast cancer Electronic Health Records (EHRs) in English to French and Swedish. The purpose of this work was to test how much effort
it would require to modify this limited-domain, template-based English generator to enable it to generate in French and Swedish. We describe the
adaptation to both languages, viewing the grammatical aspects involved and explaining the modifications performed. This work illustrates how the same
underlying knowledge representation can be used to generate output texts in multiple languages with only minor linguistic modifications.},
	booktitle    = {The 9th Bar-Ilan Symposium on the Foundations of Artificial Intelligence (BISFAI)},
	author       = {Dannélls, Dana and Deléger, Louise},
	year         = {2007},
}

@inProceedings{kokkinakis-dannells-2006-recognizing-33936,
	title        = {Recognizing Acronyms and their Definitions in {Swedish} Medical Texts},
	abstract     = {This paper addresses the task of recognizing acronym-definition pairs in Swedish (medical) texts as well as the compilation of a freely
available sample of such manually annotated pairs. A material suitable not only for supervised learning experiments, but also as
a testbed for the evaluation of the quality of future acronym-definition recognition systems. There are a number of approaches to
the identification described in the literature, particularly within the biomedical domain, but none of those addresses the variation and
complexity exhibited in a language other than English. This is realized by the fact that we can have a mixture of two languages in
the same document and/or sentence, i.e. Swedish and English; that Swedish is a compound language that significantly deteriorates
the performance of previous approaches (without adaptations) and, most importantly, the fact that there is a large variation of
possible acronym-definition permutations realized in the analysed corpora, a variation that is usually ignored in previous studies.
},
	booktitle    = {Proceedings of the 5th International Conference on Language Resources and Evaluation (LREC)},
	author       = {Kokkinakis, Dimitrios and Dannélls, Dana},
	year         = {2006},
}
Page manager: sb-webb