Skip to main content


	title        = {Romanized Berber and Romanized Arabic Automatic Language Identification Using Machine Learning},
	abstract     = {The identification of the language of text/speech input is the first step to be able to properly do any language-dependent natural language processing. The task is called Automatic Language Identification (ALI). Being a well-studied field since the early 1960s, various methods have been applied to many standard languages. The ALI standard methods require datasets for training and use character/word-based n-gram models. However, social media and new technologies have contributed to the rise of informal and minority languages on the Web. The state-of-the-art automatic language identifiers fail to properly identify many of them. Romanized Arabic (RA) and Romanized Berber (RB) are cases of these informal languages which are under-resourced. The goal of this paper is twofold: detect RA and RB, at a document level, as separate languages and distinguish between them as they coexist in North Africa. We consider the task as a classification problem and use supervised machine learning to solve it. For both languages, character-based 5-grams combined with additional lexicons score the best, F-score of 99.75\% and 97.77\% for RB and RA respectively.},
	booktitle    = {Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects; 53–61; December 12, 2016 ; Osaka, Japan},
	author       = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard},
	year         = {2016},
	publisher    = {Association for Computational Linguistics},