% Bibliography records for two 2019 publications: one technical report and
% one conference paper. The identical repeated copies of these two records
% were collapsed to a single copy each, because BibTeX and Biber treat a
% repeated citation key as an error or warning.

% Technical report.
% The citation key is kept unchanged (including its non-ASCII letter) so that
% existing citations keep resolving. NOTE(review): classic 8-bit BibTeX
% toolchains may not accept a non-ASCII key; confirm the target toolchain.
% TODO(review): the techreport entry type expects an "institution" field and
% none is recorded here; supply it from the publication record.
@techreport{Ljunglöf-Peter2019-281222,
  author   = {Ljunglöf, Peter and Zechner, Niklas and Nieto Piña, Luis and Adesam, Yvonne and Borin, Lars},
  title    = {Assessing the quality of {Språkbanken’s} annotations},
  year     = {2019},
  abstract = {Most of the corpora in Språkbanken Text consist of unannotated
              plain text, such as almost all newspaper texts, social media
              texts, novels and official documents. We also have some corpora
              that are manually annotated in different ways, such as Talbanken
              (annotated for part-of-speech and syntactic structure), and the
              Stockholm Umeå Corpus (annotated for part-of-speech).
              Språkbanken’s annotation pipeline Sparv aims to automatise the
              work of automatically annotating all our corpora, while still
              keeping the manual annotations intact. When all corpora are
              annotated, they can be made available, e.g., in the corpus
              search tools Korp and Strix. Until now there has not been any
              comprehensive overview of the annotation tools and models that
              Sparv has been using for the last eight years. Some of them have
              not been updated since the start, such as the part-of-speech
              tagger Hunpos and the dependency parser MaltParser. There are
              also annotation tools that we still have not included, such as a
              constituency-based parser. Therefore Språkbanken initiated a
              project with the aim of conducting such an overview. This
              document is the outcome of that project, and it contains
              descriptions of the types of manual and automatic annotations
              that we currently have in Språkbanken, as well as an incomplete
              overview of the state-of-the-art with regards to annotation
              tools and models.},
}

% Conference paper. Proper nouns and the acronym are brace-protected so that
% styles which apply sentence casing do not lowercase them.
@inproceedings{Adesam-Yvonne2019-279948,
  author    = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina},
  title     = {Exploring the Quality of the Digital Historical Newspaper Archive {KubHist}},
  booktitle = {Proceedings of the 4th Conference of The Association Digital Humanities in the Nordic Countries ({DHN})},
  year      = {2019},
  abstract  = {The KubHist Corpus is a massive corpus of Swedish historical
               newspapers, digitized by the Royal Swedish library, and
               available through the Språkbanken corpus infrastructure Korp.
               This paper contains a first overview of the KubHist corpus,
               exploring some of the difficulties with the data, such as OCR
               errors and spelling variation, and discussing possible paths
               for improving the quality and the searchability.},
}

% NOTE(review): the entry that starts on the next line continues beyond the
% reviewed section, so it is left byte-for-byte untouched. Its visible head
% repeats the key Adesam-Yvonne2019-279948; if it turns out to be another
% copy of the record above, remove it together with its continuation.
@inProceedings{Adesam-Yvonne2019-279948, title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist}, abstract = {The KubHist Corpus is a massive corpus of Swedish historical newspapers, digitized by the Royal Swedish library, and available through the Språkbanken corpus infrastructure Korp. 
This paper contains a first overview of the KubHist corpus, exploring some of the difficulties with the data, such as OCR errors and spelling variation, and discussing possible paths for improving the quality and the searchability.}, booktitle = {Proceedings of the 4th Conference of The Association Digital Humanities in the Nordic Countries (DHN)}, author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina}, year = {2019}, } @techreport{Ljunglöf-Peter2019-281222, title = {Assessing the quality of Språkbanken’s annotations}, abstract = {Most of the corpora in Språkbanken Text consist of unannotated plain text, such as almost all newspaper texts, social media texts, novels and official documents. We also have some corpora that are manually annotated in different ways, such as Talbanken (annotated for part-of-speech and syntactic structure), and the Stockholm Umeå Corpus (annotated for part-of-speech). Språkbanken’s annotation pipeline Sparv aims to automatise the work of automatically annotating all our corpora, while still keeping the manual annotations intact. When all corpora are annotated, they can be made available, e.g., in the corpus searh tools Korp and Strix. Until now there has not been any comprehensive overview of the annotation tools and models that Sparv has been using for the last eight years. Some of them have not been updated since the start, such as the part-of-speech tagger Hunpos and the dependency parser MaltParser. There are also annotation tools that we still have not included, such as a constituency-based parser. Therefore Språkbanken initiated a project with the aim of conducting such an overview. This document is the outcome of that project, and it contains descriptions of the types of manual and automatic annotations that we currently have in Språkbanken, as well as an incomplete overview of the state-of-the-art with regards to annotation tools and models. 
}, author = {Ljunglöf, Peter and Zechner, Niklas and Nieto Piña, Luis and Adesam, Yvonne and Borin, Lars}, year = {2019}, } @inProceedings{Adesam-Yvonne2019-279948, title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist}, abstract = {The KubHist Corpus is a massive corpus of Swedish historical newspapers, digitized by the Royal Swedish library, and available through the Språkbanken corpus infrastructure Korp. This paper contains a first overview of the KubHist corpus, exploring some of the difficulties with the data, such as OCR errors and spelling variation, and discussing possible paths for improving the quality and the searchability.}, booktitle = {Proceedings of the 4th Conference of The Association Digital Humanities in the Nordic Countries (DHN)}, author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina}, year = {2019}, } @inProceedings{Adesam-Yvonne2018-267311, title = {FSvReader – Exploring Old Swedish Cultural Heritage Texts}, abstract = {This paper describes FSvReader, a tool for easier access to Old Swedish (13th–16th century) texts. Through automatic fuzzy linking of words in a text to a dictionary describing the language of the time, the reader has direct access to dictionary pop-up definitions, in spite of the large amount of morphological and spelling variation. The linked dictionary entries can also be used for simple searches in the text, highlighting possible further instances of the same entry. }, booktitle = {CEUR Workshop Proceedings, vol. 2084. Proceedings of the Digital Humanities in the Nordic Countries 3rd Conference Helsinki, Finland, March 7-9, 2018. 
Edited by Eetu, Mäkelä Mikko, Tolonen Jouni Tuominen}, author = {Adesam, Yvonne and Ahlberg, Malin and Bouma, Gerlof}, year = {2018}, publisher = {University of Helsinki, Faculty of Arts}, adress = {Helsinki}, } @inProceedings{Adesam-Yvonne2018-279802, title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist}, booktitle = {Abstracts of the Swedish Language Technology Conference (SLTC), October 7-9, 2018, Stockholm}, author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273841, title = {The Koala Part-of-Speech and Morphological Tagset for Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November, 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273839, title = {The Eukalyptus Treebank of Written Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard and Borin, Lars and Forsberg, Markus}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273835, title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November 2018}, author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-279802, title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist}, booktitle = {Abstracts of the Swedish Language Technology Conference (SLTC), October 7-9, 2018, Stockholm}, author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273841, title = {The Koala Part-of-Speech and Morphological Tagset for Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November, 2018}, author = 
{Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273839, title = {The Eukalyptus Treebank of Written Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard and Borin, Lars and Forsberg, Markus}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273835, title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November 2018}, author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-279802, title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist}, booktitle = {Abstracts of the Swedish Language Technology Conference (SLTC), October 7-9, 2018, Stockholm}, author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273841, title = {The Koala Part-of-Speech and Morphological Tagset for Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November, 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273839, title = {The Eukalyptus Treebank of Written Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard and Borin, Lars and Forsberg, Markus}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273835, title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November 2018}, author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-279802, title = {Exploring 
the Quality of the Digital Historical Newspaper Archive KubHist}, booktitle = {Abstracts of the Swedish Language Technology Conference (SLTC), October 7-9, 2018, Stockholm}, author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273841, title = {The Koala Part-of-Speech and Morphological Tagset for Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November, 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273839, title = {The Eukalyptus Treebank of Written Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard and Borin, Lars and Forsberg, Markus}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273835, title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November 2018}, author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-279802, title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist}, booktitle = {Abstracts of the Swedish Language Technology Conference (SLTC), October 7-9, 2018, Stockholm}, author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273841, title = {The Koala Part-of-Speech and Morphological Tagset for Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November, 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273839, title = {The Eukalyptus Treebank of Written Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018}, author = 
{Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard and Borin, Lars and Forsberg, Markus}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273835, title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November 2018}, author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-279802, title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist}, booktitle = {Abstracts of the Swedish Language Technology Conference (SLTC), October 7-9, 2018, Stockholm}, author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273841, title = {The Koala Part-of-Speech and Morphological Tagset for Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November, 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273839, title = {The Eukalyptus Treebank of Written Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard and Borin, Lars and Forsberg, Markus}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-273835, title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November 2018}, author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina}, year = {2018}, } @inProceedings{Adesam-Yvonne2018-279802, title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist}, booktitle = {Abstracts of the Swedish Language Technology Conference (SLTC), October 7-9, 2018, Stockholm}, author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina}, year = {2018}, } 
% Duplicate clean-up for this part of the file.
% This span used to open with several dozen further copies of the four SLTC
% 2018 entries (keys Adesam-Yvonne2018-279802, -273841, -273839 and -273835).
% Each of those keys is already defined earlier in the file, and BibTeX and
% Biber report every later copy as a "repeated entry", so the repeats are
% removed. The entries below are unique; field values are unchanged except
% where a comment says otherwise (trailing or leading blanks inside values are
% trimmed and case-sensitive words in titles are brace-protected).

% Whole proceedings volume: previously typed as misc with the volume editors in
% the author field. Also fixes the misspelled field name "adress", which styles
% silently ignore.
% NOTE(review): the two people listed are assumed to be the volume editors;
% confirm against the title page.
@proceedings{Bouma-Gerlof2017-254435,
  title     = {Proceedings of the {NoDaLiDa} 2017 Workshop on Processing Historical Language},
  editor    = {Bouma, Gerlof and Adesam, Yvonne},
  year      = {2017},
  publisher = {Linköping University Electronic Press, Linköpings universitet},
  address   = {Linköping},
  ISBN      = {978-91-7685-503-4},
}

@inProceedings{Johansson-Richard2016-233140,
  title     = {A Multi-domain Corpus of {Swedish} Word Sense Annotation},
  abstract  = {We describe the word sense annotation layer in Eukalyptus, a freely available five-domain corpus of contemporary Swedish with several annotation layers. The annotation uses the SALDO lexicon to define the sense inventory, and allows word sense annotation of compound segments and multiword units. We give an overview of the new annotation tool developed for this project, and finally present an analysis of the inter-annotator agreement between two annotators.},
  booktitle = {10th edition of the Language Resources and Evaluation Conference, 23-28 May 2016, Portorož (Slovenia)},
  author    = {Johansson, Richard and Adesam, Yvonne and Bouma, Gerlof and Hedberg, Karin},
  year      = {2016},
  publisher = {European Language Resources Association},
  ISBN      = {978-2-9517408-9-1},
}

% NOTE(review): an article entry requires a journal field, which is missing
% here, and the number field holds what looks like a volume title rather than
% an issue number. The venue cannot be determined from this file, so the data
% is left as exported; TODO confirm the venue and add journal (or retype the
% entry as a contribution to a collection).
@article{Adesam-Yvonne2016-237884,
  title    = {Språkteknologi för svenska språket genom tiderna},
  abstract = {Språkbanken, the Swedish Language Bank, is a language technology research unit at the Department of Swedish, University of Gothenburg. We develop language resources – such as corpora, lexical resources, and analytical tools – for all variants of Swedish, from Old Swedish laws to present-day social media. Historical texts offer exciting theoretical and methodological challenges for language technology because they often defy the assumption inherent in most automatic analysis tools that the texts contain a standardized written language. In this article, we describe our ongoing work on the development of annotated historical corpora, as well as our efforts on linking various resources (both corpora and lexical resources). This research advances the state of the art of language technology as well as enables new research for scholars in other disciplines.},
  author   = {Adesam, Yvonne and Ahlberg, Malin and Andersson, Peter and Borin, Lars and Bouma, Gerlof and Forsberg, Markus},
  year     = {2016},
  volume   = {76},
  number   = {Studier i svensk språkhistoria 13},
  pages    = {65--87},
}

% Fixes the misspelled field name "adress" and the trailing blank in its value.
@inProceedings{Adesam-Yvonne2016-251827,
  title     = {{Old Swedish} Part-of-Speech Tagging between Variation and External Knowledge},
  booktitle = {Proceedings of the 10th SIGHUM Workshop on Language Technology for Cultural Heritage, Social Sciences, and Humanities, Berlin, Germany, August 11, 2016},
  author    = {Adesam, Yvonne and Bouma, Gerlof},
  year      = {2016},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  ISBN      = {978-1-945626-09-8},
}

@inProceedings{Bouma-Gerlof2016-251825,
  title     = {Multiword Annotation in the {Eukalyptus} Treebank of Written {Swedish}},
  booktitle = {PARSEME, 6th general meeting, 7-8 April 2016, Struga, FYR Macedonia},
  author    = {Bouma, Gerlof and Adesam, Yvonne},
  year      = {2016},
}

@inProceedings{Bouma-Gerlof2016-254389,
  title     = {Part-of-speech and Morphology Tagging {Old Swedish}},
  booktitle = {Proceedings of the Sixth Swedish Language Technology Conference (SLTC) Umeå University, 17-18 November, 2016},
  author    = {Bouma, Gerlof and Adesam, Yvonne},
  year      = {2016},
}

@inProceedings{Cap-Fabienne2016-254388,
  title     = {{SWORD}: Towards Cutting-Edge {Swedish} Word Processing},
  abstract  = {Despite many years of research on Swedish language technology, there is still no well-documented standard for Swedish word processing covering the whole spectrum from low-level tokenization to morphological analysis and disambiguation. SWORD is a new initiative within the SWE-CLARIN consortium aiming to develop documented standards for Swedish word processing. In this paper, we report on a pilot study of Swedish tokenization, where we compare the output of six different tokenizers on four different text types. For one text type (Wikipedia articles), we also compare to the tokenization produced by six manual annotators.},
  booktitle = {Proceedings of the Sixth Swedish Language Technology Conference (SLTC) Umeå University, 17-18 November, 2016},
  author    = {Cap, Fabienne and Adesam, Yvonne and Ahrenberg, Lars and Borin, Lars and Bouma, Gerlof and Forsberg, Markus and Kann, Viggo and Östling, Robert and Smith, Aaron and Wirén, Mats and Nivre, Joakim},
  year      = {2016},
}

% The volume editor was written into booktitle ("Edited by ..."); it is moved
% to the editor field so that styles can format it.
@inProceedings{Adesam-Yvonne2015-217815,
  title     = {Defining the {Eukalyptus} forest – the {Koala} treebank of {Swedish}},
  abstract  = {This paper details the design of the lexical and syntactic layers of a new annotated corpus of Swedish contemporary texts. In order to make the corpus adaptable into a variety of representations, the annotation is of a hybrid type with head-marked constituents and function-labeled edges, and with a rich annotation of non-local dependencies. The source material has been taken from public sources, to allow the resulting corpus to be made freely available.},
  booktitle = {Proceedings of the 20th Nordic Conference of Computational Linguistics, NODALIDA 2015, May 11-13, 2015, Vilnius, Lithuania},
  editor    = {Megyesi, Beáta},
  author    = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard},
  year      = {2015},
  ISBN      = {978-91-7519-098-3},
  pages     = {1--9},
}

@inProceedings{Adesam-Yvonne2015-228833,
  title     = {Multiwords, Word Senses and Multiword Senses in the {Eukalyptus} Treebank of Written {Swedish}},
  abstract  = {Multiwords reside at the intersection of the lexicon and syntax and in an annotation project, they will affect both levels. In the Eukalyptus treebank of written Swedish, we treat multiwords formally as syntactic objects, which are assigned a lexical type and sense. With the help of a simple dichotomy, analyzed vs unanalyzed multiwords, and the expressiveness of the syntactic annotation formalism employed, we are able to flexibly handle most multiword types and usages.},
  booktitle = {Proceedings of the Fourteenth International Workshop on Treebanks and Linguistic Theories (TLT14), 11–12 December 2015 Warsaw, Poland},
  author    = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard},
  year      = {2015},
  ISBN      = {978-83-63159-18-4},
  pages     = {3--12},
}

% The ISBN value had a leading blank and booktitle a trailing blank.
@inProceedings{Adesam-Yvonne2014-198794,
  title     = {Computer-aided Morphology Expansion for {Old Swedish}},
  abstract  = {In this paper we describe and evaluate a tool for paradigm induction and lexicon extraction that has been applied to Old Swedish. The tool is semi-supervised and uses a small seed lexicon and unannotated corpora to derive full inflection tables for input lemmata. In the work presented here, the tool has been modified to deal with the rich spelling variation found in Old Swedish texts. We also present some initial experiments, which are the first steps towards creating a large-scale morphology for Old Swedish.},
  booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14) May 26-31, 2014 Reykjavik, Iceland},
  author    = {Adesam, Yvonne and Ahlberg, Malin and Andersson, Peter and Bouma, Gerlof and Forsberg, Markus and Hulden, Mans},
  year      = {2014},
  ISBN      = {978-2-9517408-8-4},
  pages     = {1102--1105},
}

@inProceedings{Adesam-Yvonne2014-211376,
  title     = {{Koala} – {Korp’s} Linguistic Annotations Developing an infrastructure for text-based research with high-quality annotations},
  booktitle = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014},
  author    = {Adesam, Yvonne and Borin, Lars and Bouma, Gerlof and Forsberg, Markus and Johansson, Richard},
  year      = {2014},
}

@inProceedings{Bouma-Gerlof2013-177631, title = {Experiments on sentence segmentation in Old Swedish editions}, booktitle = {NEALT 
Proceedings Series }, author = {Bouma, Gerlof and Adesam, Yvonne}, year = {2013}, volume = {18}, ISBN = {978-91-7519-587-2}, } @inProceedings{Adesam-Yvonne2012-163218, title = {bokstaffua, bokstaffwa, bokstafwa, bokstaua, bokstawa... Towards lexical link-up for a corpus of Old Swedish}, booktitle = {Proceedings of the LTHist workshop at Konvens}, author = {Adesam, Yvonne and Ahlberg, Malin and Bouma, Gerlof}, year = {2012}, } @inProceedings{Adesam-Yvonne2012-166657, title = {Processing spelling variation in historical text}, booktitle = {Proceedings of the Fourth Swedish Language Technology Conference (SLTC)}, author = {Adesam, Yvonne and Ahlberg, Malin and Bouma, Gerlof}, year = {2012}, }