Skip to main content

BibTeX

@comment{
  Conference paper (LREC, 2006) describing the collection, encoding and
  linguistic processing of the MEDLEX Swedish medical corpus.
  Acronyms and proper nouns in title/booktitle are brace-protected so that
  sentence-casing bibliography styles keep their capitalisation.
}
@inproceedings{kokkinakis-2006-collection-33937,
	author       = {Kokkinakis, Dimitrios},
	title        = {Collection, Encoding and Linguistic Processing of a {Swedish} Medical Corpus -- The {MEDLEX} Experience},
	booktitle    = {Proceedings of the 5th International Conference on Language Resources and Evaluation ({LREC})},
	year         = {2006},
	abstract     = {Corpora annotated with structural and linguistic characteristics play a major role in nearly every area of language processing.
		During recent years a number of corpora and large data sets became known and available to research even in specialized fields such as medicine, but still however, targeted predominantly for the English language.
		This paper provides a description of the collection, encoding and linguistic processing of an ever growing Swedish medical corpus, the MEDLEX Corpus.
		MEDLEX consists of a variety of text-documents related to various medical text genres.
		The MEDLEX Corpus has been structurally annotated using the Corpus Encoding Standard for XML (XCES), lemmatized and automatically annotated with part-of-speech and semantic information (extended named entities and the Medical Subject Headings, MeSH, terminology).
		The results from the processing stages (part-of-speech, entities and terminology) have been merged into a single representation format and syntactically analysed using a cascaded finite state parser.
		Finally, the parser's results are converted into a tree structure that follows the TIGER-XML coding scheme, resulting a suitable for further exploration and fairly large Treebank of Swedish medical texts.},
}