Task Clusters
Sahara is designed to evaluate model performance across a diverse set of languages and tasks, reflecting the continent's rich linguistic landscape. The benchmark comprises 16 tasks across four primary task clusters, providing a robust framework for evaluation.
| Task Name | Identifier | Score Metric |
| --- | --- | --- |
| Context-based Question Answering | squad_qa | Macro F1 |
| General Knowledge | mmlu | Accuracy |
| Mathematical Word Problems | mgsm | Exact Match |
| Reading Comprehension | belebele | Accuracy |

| Task Name | Identifier | Score Metric |
| --- | --- | --- |
| Cross-Lingual Natural Language Inference | xlni | Accuracy |
| Language Identification | lid | Macro F1 |
| News Classification | news | Macro F1 |
| Sentiment Analysis | sentiment | Macro F1 |
| Topic Classification | topic | Macro F1 |

| Task Name | Identifier | Score Metric |
| --- | --- | --- |
| Machine Translation - African to African | mt_xx2xx | spBleu-1K |
| Machine Translation - English to African | mt_eng2xx | spBleu-1K |
| Machine Translation - French to African | mt_fra2xx | spBleu-1K |
| Paraphrase | paraphrase | spBleu-1K |
| Summarization | summary | RougeL |
| Title Generation | title | spBleu-1K |

| Task Name | Identifier | Score Metric |
| --- | --- | --- |
| NER | ner | Macro F1 |
| Phrase Chunking | phrase | Macro F1 |
| POS Tagging | pos | Macro F1 |
              
              Context-based Question Answering Task BibTeX
            
            @article{10.1162/tacl_a_00317,
    author  = {Clark, Jonathan H. and Choi, Eunsol and Collins, Michael and Garrette, Dan and Kwiatkowski, Tom and Nikolaev, Vitaly and Palomaki, Jennimaria},
    title   = {{TyDi QA}: A Benchmark for Information-Seeking Question Answering in Typologically Diverse Languages},
    journal = {Transactions of the Association for Computational Linguistics},
    volume  = {8},
    pages   = {454--470},
    year    = {2020},
    month   = jul,
    issn    = {2307-387X},
    doi     = {10.1162/tacl_a_00317},
    url     = {https://doi.org/10.1162/tacl_a_00317},
    eprint  = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl_a_00317/1923348/tacl_a_00317.pdf},
}
              
              General Knowledge Task BibTeX
            
            @article{adelani2024irokobench,
  title         = {{IrokoBench}: A New Benchmark for {African} Languages in the Age of Large Language Models},
  author        = {Adelani, David Ifeoluwa and Ojo, Jessica and Azime, Israel Abebe and Zhuang, Jian Yun and Alabi, Jesujoba O and He, Xuanli and Ochieng, Millicent and Hooker, Sara and Bukula, Andiswa and Lee, En-Shiun Annie and others},
  journal       = {arXiv preprint arXiv:2406.03368},
  year          = {2024},
  eprint        = {2406.03368},
  archiveprefix = {arXiv},
}
              
              Mathematical Word Problems Task BibTeX
            
            @article{adelani2024irokobench,
  title         = {{IrokoBench}: A New Benchmark for {African} Languages in the Age of Large Language Models},
  author        = {Adelani, David Ifeoluwa and Ojo, Jessica and Azime, Israel Abebe and Zhuang, Jian Yun and Alabi, Jesujoba O and He, Xuanli and Ochieng, Millicent and Hooker, Sara and Bukula, Andiswa and Lee, En-Shiun Annie and others},
  journal       = {arXiv preprint arXiv:2406.03368},
  year          = {2024},
  eprint        = {2406.03368},
  archiveprefix = {arXiv},
}
              
              Reading Comprehension Task BibTeX
            
            @inproceedings{bandarkar-etal-2024-belebele,
    title = {The {Belebele} Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants},
    author = {Bandarkar, Lucas and
      Liang, Davis and
      Muller, Benjamin and
      Artetxe, Mikel and
      Shukla, Satya Narayan and
      Husa, Donald and
      Goyal, Naman and
      Krishnan, Abhinandan and
      Zettlemoyer, Luke and
      Khabsa, Madian},
    editor = {Ku, Lun-Wei and
      Martins, Andre and
      Srikumar, Vivek},
    booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
    month = aug,
    year = {2024},
    address = {Bangkok, Thailand},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.acl-long.44/},
    doi = {10.18653/v1/2024.acl-long.44},
    pages = {749--775},
}
              
              Cross-Lingual Natural Language Inference Task BibTeX
            
            @article{adelani2024irokobench,
  title         = {{IrokoBench}: A New Benchmark for {African} Languages in the Age of Large Language Models},
  author        = {Adelani, David Ifeoluwa and Ojo, Jessica and Azime, Israel Abebe and Zhuang, Jian Yun and Alabi, Jesujoba O and He, Xuanli and Ochieng, Millicent and Hooker, Sara and Bukula, Andiswa and Lee, En-Shiun Annie and others},
  journal       = {arXiv preprint arXiv:2406.03368},
  year          = {2024},
  eprint        = {2406.03368},
  archiveprefix = {arXiv},
}
              
              Language Identification Task BibTeX
            
            @article{adebara2022afrolid,
  title         = {{AfroLID}: A Neural Language Identification Tool for {African} Languages},
  author        = {Adebara, Ife and Elmadany, AbdelRahim and Abdul-Mageed, Muhammad and Inciarte, Alcides Alcoba},
  journal       = {arXiv preprint arXiv:2210.11744},
  year          = {2022},
  eprint        = {2210.11744},
  archiveprefix = {arXiv},
}
              
              News Classification Task BibTeX
            
            --- Citation 1 ---
@article{azime2021amharic,
  title         = {An {Amharic} News Text Classification Dataset},
  author        = {Azime, Israel Abebe and Mohammed, Nebil},
  journal       = {arXiv preprint arXiv:2103.05639},
  year          = {2021},
  note          = {Includes a labeled dataset of 50k+ Amharic news articles across 6 categories},
  eprint        = {2103.05639},
  archiveprefix = {arXiv},
  doi           = {10.48550/arXiv.2103.05639},
  url           = {https://doi.org/10.48550/arXiv.2103.05639},
}
--- Citation 2 ---
@inproceedings{niyongabo-etal-2020-kinnews,
    title = {{KINNEWS} and {KIRNEWS}: Benchmarking Cross-Lingual Text Classification for {Kinyarwanda} and {Kirundi}},
    author = {Niyongabo, Rubungo Andre and
      Hong, Qu and
      Kreutzer, Julia and
      Huang, Li},
    editor = {Scott, Donia and
      Bel, Nuria and
      Zong, Chengqing},
    booktitle = {Proceedings of the 28th International Conference on Computational Linguistics},
    month = dec,
    year = {2020},
    address = {Barcelona, Spain (Online)},
    publisher = {International Committee on Computational Linguistics},
    url = {https://aclanthology.org/2020.coling-main.480/},
    doi = {10.18653/v1/2020.coling-main.480},
    pages = {5507--5521},
}
--- Citation 3 ---
@dataset{david2020swahili,
  author       = {David, Davis},
  title        = {{Swahili} : News Classification Dataset},
  month        = dec,
  year         = {2020},
  publisher    = {Zenodo},
  version      = {0.1},
  doi          = {10.5281/zenodo.4300294},
  url          = {https://doi.org/10.5281/zenodo.4300294},
}
              
              Sentiment Analysis Task BibTeX
            
            --- Citation 1 ---
@article{diallo2021bambara,
  title         = {{Bambara} Language Dataset for Sentiment Analysis},
  author        = {Diallo, Mountaga and Fourati, Chayma and Haddad, Hatem},
  journal       = {arXiv preprint arXiv:2108.02524},
  year          = {2021},
  note          = {Presented at the 2nd Workshop on Practical ML for Developing Countries at ICLR 2021},
  eprint        = {2108.02524},
  archiveprefix = {arXiv},
  doi           = {10.48550/arXiv.2108.02524},
  url           = {https://doi.org/10.48550/arXiv.2108.02524},
}
--- Citation 2 ---
@article{oyewusi2020semantic,
  title         = {Semantic Enrichment of {Nigerian} {Pidgin} {English} for Contextual Sentiment Classification},
  author        = {Oyewusi, Wuraola Fisayo and Akinsande, Olalekan},
  journal       = {arXiv preprint arXiv:2003.12450},
  year          = {2020},
  eprint        = {2003.12450},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2003.12450},
}
--- Citation 3 ---
@article{shode2022yosm,
  title         = {{YOSM}: A New {Yoruba} Sentiment Corpus for Movie Reviews},
  author        = {Shode, Iyanuoluwa and Adelani, David Ifeoluwa and Feldman, Anna},
  journal       = {arXiv preprint arXiv:2204.09711},
  year          = {2022},
  note          = {Accepted to AfricaNLP Workshop at ICLR 2022},
  eprint        = {2204.09711},
  archiveprefix = {arXiv},
  doi           = {10.48550/arXiv.2204.09711},
  url           = {https://doi.org/10.48550/arXiv.2204.09711},
}
              
              Topic Classification Task BibTeX
            
            @inproceedings{hedderich-etal-2020-transfer,
    title = {Transfer Learning and Distant Supervision for Multilingual Transformer Models: A Study on {African} Languages},
    author = {Hedderich, Michael A. and
      Adelani, David I. and
      Zhu, Dawei and
      Alabi, Jesujoba and
      Markus, Udia and
      Klakow, Dietrich},
    editor = {Webber, Bonnie and
      Cohn, Trevor and
      He, Yulan and
      Liu, Yang},
    booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
    month = nov,
    year = {2020},
    address = {Online},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2020.emnlp-main.204/},
    doi = {10.18653/v1/2020.emnlp-main.204},
    pages = {2580--2591},
}
              
              Machine Translation - African to African Task BibTeX
            
            --- Citation 1 ---
@inproceedings{adelani-etal-2022-thousand,
    title = {A Few Thousand Translations Go a Long Way! Leveraging Pre-trained Models for {African} News Translation},
    author = {Adelani, David Ifeoluwa and
      Alabi, Jesujoba Oluwadara and
      Fan, Angela and
      Kreutzer, Julia and
      Shen, Xiaoyu and
      Reid, Machel and
      Ruiter, Dana and
      Klakow, Dietrich and
      Nabende, Peter and
      Chang, Ernie and
      Gwadabe, Tajuddeen and
      Sackey, Freshia and
      Dossou, Bonaventure F. P. and
      Emezue, Chris and
      Leong, Colin and
      Beukman, Michael and
      Muhammad, Shamsuddeen H. and
      Jarso, Guyo D. and
      Yousuf, Oreen and
      Niyongabo Rubungo, Andre N. and
      Hacheme, Gilles and
      Wairagala, Eric Peter and
      Nasir, Muhammad Umair and
      Ajibade, Benjamin A. and
      Ajayi, Tunde Oluwaseyi and
      Gitau, Yvonne Wambui and
      Abbott, Jade and
      Ahmed, Mohamed and
      Ochieng, Millicent and
      Aremu, Anuoluwapo and
      Ogayo, Perez and
      Mukiibi, Jonathan and
      Ouoba Kabore, Fatoumata and
      Kalipe, Godson Koffi and
      Mbaye, Derguene and
      Tapo, Allahsera Auguste and
      Memdjokam Koagne, Victoire M. and
      Munkoh-Buabeng, Edwin and
      Wagner, Valencia and
      Abdulmumin, Idris and
      Awokoya, Ayodele and
      Buzaaba, Happy and
      Sibanda, Blessing and
      Bukula, Andiswa and
      Manthalu, Sam},
    editor = {Carpuat, Marine and
      de Marneffe, Marie-Catherine and
      Meza Ruiz, Ivan Vladimir},
    booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
    month = jul,
    year = {2022},
    address = {Seattle, United States},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2022.naacl-main.223/},
    doi = {10.18653/v1/2022.naacl-main.223},
    pages = {3053--3070},
}
--- Citation 2 ---
@inproceedings{reid-etal-2021-afromt,
    title = {{AfroMT}: Pretraining Strategies and Reproducible Benchmarks for Translation of 8 {African} Languages},
    author = {Reid, Machel and
      Hu, Junjie and
      Neubig, Graham and
      Matsuo, Yutaka},
    editor = {Moens, Marie-Francine and
      Huang, Xuanjing and
      Specia, Lucia and
      Yih, Scott Wen-tau},
    booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
    month = nov,
    year = {2021},
    address = {Online and Punta Cana, Dominican Republic},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2021.emnlp-main.99/},
    doi = {10.18653/v1/2021.emnlp-main.99},
    pages = {1306--1320},
}
--- Citation 3 ---
@article{ogueji2019pidginunmt,
  title         = {{PidginUNMT}: Unsupervised Neural Machine Translation from {West} {African} {Pidgin} to {English}},
  author        = {Ogueji, Kelechi and Ahia, Orevaoghene},
  journal       = {arXiv preprint arXiv:1912.03444},
  year          = {2019},
  note          = {Presented at the NeurIPS 2019 Workshop on Machine Learning for the Developing World},
  eprint        = {1912.03444},
  archiveprefix = {arXiv},
  doi           = {10.48550/arXiv.1912.03444},
  url           = {https://doi.org/10.48550/arXiv.1912.03444},
}
--- Citation 4 ---
@inproceedings{akera2022machine,
  title     = {Machine Translation For {African} Languages: Community Creation Of Datasets And Models In {Uganda}},
  author    = {Akera, Benjamin and Mukiibi, Jonathan and Naggayi, Lydia Sanyu and Babirye, Claire and Owomugisha, Isaac and Nsumba, Solomon and others},
  booktitle = {Proceedings of the AfricaNLP Workshop at ICLR 2022},
  year      = {2022},
  note      = {Accepted to AfricaNLP 2022 Workshop, last revised May 2023},
  url       = {https://github.com/SunbirdAI/salt},
}
              
              Machine Translation - English to African Task BibTeX
            
            --- Citation 1 ---
@inproceedings{adelani-etal-2022-thousand,
    title = {A Few Thousand Translations Go a Long Way! Leveraging Pre-trained Models for {African} News Translation},
    author = {Adelani, David Ifeoluwa and
      Alabi, Jesujoba Oluwadara and
      Fan, Angela and
      Kreutzer, Julia and
      Shen, Xiaoyu and
      Reid, Machel and
      Ruiter, Dana and
      Klakow, Dietrich and
      Nabende, Peter and
      Chang, Ernie and
      Gwadabe, Tajuddeen and
      Sackey, Freshia and
      Dossou, Bonaventure F. P. and
      Emezue, Chris and
      Leong, Colin and
      Beukman, Michael and
      Muhammad, Shamsuddeen H. and
      Jarso, Guyo D. and
      Yousuf, Oreen and
      Niyongabo Rubungo, Andre N. and
      Hacheme, Gilles and
      Wairagala, Eric Peter and
      Nasir, Muhammad Umair and
      Ajibade, Benjamin A. and
      Ajayi, Tunde Oluwaseyi and
      Gitau, Yvonne Wambui and
      Abbott, Jade and
      Ahmed, Mohamed and
      Ochieng, Millicent and
      Aremu, Anuoluwapo and
      Ogayo, Perez and
      Mukiibi, Jonathan and
      Ouoba Kabore, Fatoumata and
      Kalipe, Godson Koffi and
      Mbaye, Derguene and
      Tapo, Allahsera Auguste and
      Memdjokam Koagne, Victoire M. and
      Munkoh-Buabeng, Edwin and
      Wagner, Valencia and
      Abdulmumin, Idris and
      Awokoya, Ayodele and
      Buzaaba, Happy and
      Sibanda, Blessing and
      Bukula, Andiswa and
      Manthalu, Sam},
    editor = {Carpuat, Marine and
      de Marneffe, Marie-Catherine and
      Meza Ruiz, Ivan Vladimir},
    booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
    month = jul,
    year = {2022},
    address = {Seattle, United States},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2022.naacl-main.223/},
    doi = {10.18653/v1/2022.naacl-main.223},
    pages = {3053--3070},
}
--- Citation 2 ---
@inproceedings{reid-etal-2021-afromt,
    title = {{AfroMT}: Pretraining Strategies and Reproducible Benchmarks for Translation of 8 {African} Languages},
    author = {Reid, Machel and
      Hu, Junjie and
      Neubig, Graham and
      Matsuo, Yutaka},
    editor = {Moens, Marie-Francine and
      Huang, Xuanjing and
      Specia, Lucia and
      Yih, Scott Wen-tau},
    booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
    month = nov,
    year = {2021},
    address = {Online and Punta Cana, Dominican Republic},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2021.emnlp-main.99/},
    doi = {10.18653/v1/2021.emnlp-main.99},
    pages = {1306--1320},
}
--- Citation 3 ---
@article{ogueji2019pidginunmt,
  title         = {{PidginUNMT}: Unsupervised Neural Machine Translation from {West} {African} {Pidgin} to {English}},
  author        = {Ogueji, Kelechi and Ahia, Orevaoghene},
  journal       = {arXiv preprint arXiv:1912.03444},
  year          = {2019},
  note          = {Presented at the NeurIPS 2019 Workshop on Machine Learning for the Developing World},
  eprint        = {1912.03444},
  archiveprefix = {arXiv},
  doi           = {10.48550/arXiv.1912.03444},
  url           = {https://doi.org/10.48550/arXiv.1912.03444},
}
--- Citation 4 ---
@inproceedings{akera2022machine,
  title     = {Machine Translation For {African} Languages: Community Creation Of Datasets And Models In {Uganda}},
  author    = {Akera, Benjamin and Mukiibi, Jonathan and Naggayi, Lydia Sanyu and Babirye, Claire and Owomugisha, Isaac and Nsumba, Solomon and others},
  booktitle = {Proceedings of the AfricaNLP Workshop at ICLR 2022},
  year      = {2022},
  note      = {Accepted to AfricaNLP 2022 Workshop, last revised May 2023},
  url       = {https://github.com/SunbirdAI/salt},
}
              
              Machine Translation - French to African Task BibTeX
            
            --- Citation 1 ---
@inproceedings{adelani-etal-2022-thousand,
    title = {A Few Thousand Translations Go a Long Way! Leveraging Pre-trained Models for {African} News Translation},
    author = {Adelani, David Ifeoluwa and
      Alabi, Jesujoba Oluwadara and
      Fan, Angela and
      Kreutzer, Julia and
      Shen, Xiaoyu and
      Reid, Machel and
      Ruiter, Dana and
      Klakow, Dietrich and
      Nabende, Peter and
      Chang, Ernie and
      Gwadabe, Tajuddeen and
      Sackey, Freshia and
      Dossou, Bonaventure F. P. and
      Emezue, Chris and
      Leong, Colin and
      Beukman, Michael and
      Muhammad, Shamsuddeen H. and
      Jarso, Guyo D. and
      Yousuf, Oreen and
      Niyongabo Rubungo, Andre N. and
      Hacheme, Gilles and
      Wairagala, Eric Peter and
      Nasir, Muhammad Umair and
      Ajibade, Benjamin A. and
      Ajayi, Tunde Oluwaseyi and
      Gitau, Yvonne Wambui and
      Abbott, Jade and
      Ahmed, Mohamed and
      Ochieng, Millicent and
      Aremu, Anuoluwapo and
      Ogayo, Perez and
      Mukiibi, Jonathan and
      Ouoba Kabore, Fatoumata and
      Kalipe, Godson Koffi and
      Mbaye, Derguene and
      Tapo, Allahsera Auguste and
      Memdjokam Koagne, Victoire M. and
      Munkoh-Buabeng, Edwin and
      Wagner, Valencia and
      Abdulmumin, Idris and
      Awokoya, Ayodele and
      Buzaaba, Happy and
      Sibanda, Blessing and
      Bukula, Andiswa and
      Manthalu, Sam},
    editor = {Carpuat, Marine and
      de Marneffe, Marie-Catherine and
      Meza Ruiz, Ivan Vladimir},
    booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
    month = jul,
    year = {2022},
    address = {Seattle, United States},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2022.naacl-main.223/},
    doi = {10.18653/v1/2022.naacl-main.223},
    pages = {3053--3070},
}
--- Citation 2 ---
@inproceedings{reid-etal-2021-afromt,
    title = {{AfroMT}: Pretraining Strategies and Reproducible Benchmarks for Translation of 8 {African} Languages},
    author = {Reid, Machel and
      Hu, Junjie and
      Neubig, Graham and
      Matsuo, Yutaka},
    editor = {Moens, Marie-Francine and
      Huang, Xuanjing and
      Specia, Lucia and
      Yih, Scott Wen-tau},
    booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
    month = nov,
    year = {2021},
    address = {Online and Punta Cana, Dominican Republic},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2021.emnlp-main.99/},
    doi = {10.18653/v1/2021.emnlp-main.99},
    pages = {1306--1320},
}
--- Citation 3 ---
@article{ogueji2019pidginunmt,
  title         = {{PidginUNMT}: Unsupervised Neural Machine Translation from {West} {African} {Pidgin} to {English}},
  author        = {Ogueji, Kelechi and Ahia, Orevaoghene},
  journal       = {arXiv preprint arXiv:1912.03444},
  year          = {2019},
  note          = {Presented at the NeurIPS 2019 Workshop on Machine Learning for the Developing World},
  eprint        = {1912.03444},
  archiveprefix = {arXiv},
  doi           = {10.48550/arXiv.1912.03444},
  url           = {https://doi.org/10.48550/arXiv.1912.03444},
}
--- Citation 4 ---
@inproceedings{akera2022machine,
  title     = {Machine Translation For {African} Languages: Community Creation Of Datasets And Models In {Uganda}},
  author    = {Akera, Benjamin and Mukiibi, Jonathan and Naggayi, Lydia Sanyu and Babirye, Claire and Owomugisha, Isaac and Nsumba, Solomon and others},
  booktitle = {Proceedings of the AfricaNLP Workshop at ICLR 2022},
  year      = {2022},
  note      = {Accepted to AfricaNLP 2022 Workshop, last revised May 2023},
  url       = {https://github.com/SunbirdAI/salt},
}
              
              Paraphrase Task BibTeX
            
            @inproceedings{scherrer-2020-tapaco,
    title = {{TaPaCo}: A Corpus of Sentential Paraphrases for 73 Languages},
    author = {Scherrer, Yves},
    editor = {Calzolari, Nicoletta and
      B{\'e}chet, Fr{\'e}d{\'e}ric and
      Blache, Philippe and
      Choukri, Khalid and
      Cieri, Christopher and
      Declerck, Thierry and
      Goggi, Sara and
      Isahara, Hitoshi and
      Maegaard, Bente and
      Mariani, Joseph and
      Mazo, H{\'e}l{\`e}ne and
      Moreno, Asuncion and
      Odijk, Jan and
      Piperidis, Stelios},
    booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
    month = may,
    year = {2020},
    address = {Marseille, France},
    publisher = {European Language Resources Association},
    url = {https://aclanthology.org/2020.lrec-1.848/},
    pages = {6868--6873},
    language = {eng},
    ISBN = {979-10-95546-34-4},
}
              
              Summarization Task BibTeX
            
            @inproceedings{hasan-etal-2021-xl,
    title = {{XL}-Sum: Large-Scale Multilingual Abstractive Summarization for 44 Languages},
    author = {Hasan, Tahmid and
      Bhattacharjee, Abhik and
      Islam, Md. Saiful and
      Mubasshir, Kazi and
      Li, Yuan-Fang and
      Kang, Yong-Bin and
      Rahman, M. Sohel and
      Shahriyar, Rifat},
    editor = {Zong, Chengqing and
      Xia, Fei and
      Li, Wenjie and
      Navigli, Roberto},
    booktitle = {Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021},
    month = aug,
    year = {2021},
    address = {Online},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2021.findings-acl.413/},
    doi = {10.18653/v1/2021.findings-acl.413},
    pages = {4693--4703},
}

@article{adebara2024cheetah,
  title         = {{Cheetah}: Natural Language Generation for 517 {African} Languages},
  author        = {Adebara, Ife and Elmadany, AbdelRahim and Abdul-Mageed, Muhammad},
  journal       = {arXiv preprint arXiv:2401.01053},
  year          = {2024},
  eprint        = {2401.01053},
  archiveprefix = {arXiv},
}
              
              Title Generation Task BibTeX
            
            --- Citation 1 ---
@inproceedings{hasan-etal-2021-xl,
    title = {{XL}-Sum: Large-Scale Multilingual Abstractive Summarization for 44 Languages},
    author = {Hasan, Tahmid and
      Bhattacharjee, Abhik and
      Islam, Md. Saiful and
      Mubasshir, Kazi and
      Li, Yuan-Fang and
      Kang, Yong-Bin and
      Rahman, M. Sohel and
      Shahriyar, Rifat},
    editor = {Zong, Chengqing and
      Xia, Fei and
      Li, Wenjie and
      Navigli, Roberto},
    booktitle = {Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021},
    month = aug,
    year = {2021},
    address = {Online},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2021.findings-acl.413/},
    doi = {10.18653/v1/2021.findings-acl.413},
    pages = {4693--4703},
}
--- Citation 2 ---
@article{adebara2024cheetah,
  title         = {{Cheetah}: Natural Language Generation for 517 {African} Languages},
  author        = {Adebara, Ife and Elmadany, AbdelRahim and Abdul-Mageed, Muhammad},
  journal       = {arXiv preprint arXiv:2401.01053},
  year          = {2024},
  eprint        = {2401.01053},
  archiveprefix = {arXiv},
}
              
              NER Task BibTeX
            
            --- Citation 1 ---
@article{adelani2021masakhaner,
  title     = {{MasakhaNER}: Named Entity Recognition for {African} Languages},
  author    = {Adelani, David Ifeoluwa and Abbott, Jade and Neubig, Graham and D'souza, Daniel and Kreutzer, Julia and Lignos, Constantine and Palen-Michel, Chester and Buzaaba, Happy and Rijhwani, Shruti and Ruder, Sebastian and others},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {9},
  pages     = {1116--1131},
  year      = {2021},
  publisher = {MIT Press},
}
--- Citation 2 ---
@article{adelani2022masakhaner,
  title         = {{MasakhaNER} 2.0: {Africa}-Centric Transfer Learning for Named Entity Recognition},
  author        = {Adelani, David Ifeoluwa and Neubig, Graham and Ruder, Sebastian and Rijhwani, Shruti and Beukman, Michael and Palen-Michel, Chester and Lignos, Constantine and Alabi, Jesujoba O and Muhammad, Shamsuddeen H and Nabende, Peter and others},
  journal       = {arXiv preprint arXiv:2210.12391},
  year          = {2022},
  eprint        = {2210.12391},
  archiveprefix = {arXiv},
}
--- Citation 3 ---
@inproceedings{eiselen-2016-government,
    title = {Government Domain Named Entity Recognition for {South} {African} Languages},
    author = {Eiselen, Roald},
    editor = {Calzolari, Nicoletta and
      Choukri, Khalid and
      Declerck, Thierry and
      Goggi, Sara and
      Grobelnik, Marko and
      Maegaard, Bente and
      Mariani, Joseph and
      Mazo, Helene and
      Moreno, Asuncion and
      Odijk, Jan and
      Piperidis, Stelios},
    booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)},
    month = may,
    year = {2016},
    address = {Portoro{\v{z}}, Slovenia},
    publisher = {European Language Resources Association (ELRA)},
    url = {https://aclanthology.org/L16-1533/},
    pages = {3344--3348},
}
--- Citation 4 ---
@inproceedings{alabi-etal-2020-massive,
    title = {Massive vs. Curated Embeddings for Low-Resourced Languages: the Case of {Y}or{\`u}b{\'a} and {Twi}},
    author = {Alabi, Jesujoba O. and
      Amponsah-Kaakyire, Kwabena and
      Adelani, David I. and
      Espa{\~n}a-Bonet, Cristina},
    editor = {Calzolari, Nicoletta and
      B{\'e}chet, Fr{\'e}d{\'e}ric and
      Blache, Philippe and
      Choukri, Khalid and
      Cieri, Christopher and
      Declerck, Thierry and
      Goggi, Sara and
      Isahara, Hitoshi and
      Maegaard, Bente and
      Mariani, Joseph and
      Mazo, H{\'e}l{\`e}ne and
      Moreno, Asuncion and
      Odijk, Jan and
      Piperidis, Stelios},
    booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
    month = may,
    year = {2020},
    address = {Marseille, France},
    publisher = {European Language Resources Association},
    url = {https://aclanthology.org/2020.lrec-1.335/},
    pages = {2754--2762},
    language = {eng},
    ISBN = {979-10-95546-34-4},
    abstract = {The success of several architectures to learn semantic representations from unannotated text and the availability of these kind of texts in online multilingual resources such as Wikipedia has facilitated the massive and automatic creation of resources for multiple languages. The evaluation of such resources is usually done for the high-resourced languages, where one has a smorgasbord of tasks and test sets to evaluate on. For low-resourced languages, the evaluation is more difficult and normally ignored, with the hope that the impressive capability of deep learning architectures to learn (multilingual) representations in the high-resourced setting holds in the low-resourced setting too. In this paper we focus on two African languages, Yor{\`u}b{\'a} and Twi, and compare the word embeddings obtained in this way, with word embeddings obtained from curated corpora and a language-dependent processing. We analyse the noise in the publicly available corpora, collect high quality and noisy data for the two languages and quantify the improvements that depend not only on the amount of data but on the quality too. We also use different architectures that learn word representations both from surface forms and characters to further exploit all the available information which showed to be important for these languages. For the evaluation, we manually translate the wordsim-353 word pairs dataset from English into Yor{\`u}b{\'a} and Twi. We extend the analysis to contextual word embeddings and evaluate multilingual BERT on a named entity recognition task. For this, we annotate with named entities the Global Voices corpus for Yor{\`u}b{\'a}. As output of the work, we provide corpora, embeddings and the test suits for both languages.},
}
--- Citation 5 ---
@inproceedings{pan-etal-2017-cross,
    title = {Cross-lingual Name Tagging and Linking for 282 Languages},
    author = {Pan, Xiaoman and
      Zhang, Boliang and
      May, Jonathan and
      Nothman, Joel and
      Knight, Kevin and
      Ji, Heng},
    editor = {Barzilay, Regina and
      Kan, Min-Yen},
    booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
    month = jul,
    year = {2017},
    address = {Vancouver, Canada},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/P17-1178/},
    doi = {10.18653/v1/P17-1178},
    pages = {1946--1958},
}
              
              Phrase Chunking Task BibTeX
            
            @inproceedings{eiselen-2016-government,
    title = {Government Domain Named Entity Recognition for {South} {African} Languages},
    author = {Eiselen, Roald},
    editor = {Calzolari, Nicoletta and
      Choukri, Khalid and
      Declerck, Thierry and
      Goggi, Sara and
      Grobelnik, Marko and
      Maegaard, Bente and
      Mariani, Joseph and
      Mazo, Helene and
      Moreno, Asuncion and
      Odijk, Jan and
      Piperidis, Stelios},
    booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)},
    month = may,
    year = {2016},
    address = {Portoro{\v{z}}, Slovenia},
    publisher = {European Language Resources Association (ELRA)},
    url = {https://aclanthology.org/L16-1533/},
    pages = {3344--3348},
}
              
              POS Tagging Task BibTeX
            
            --- Citation 1 ---
@article{10.1145/3146387,
  author     = {Onyenwe, Ikechukwu E. and Hepple, Mark and Chinedu, Uchechukwu and Ezeani, Ignatius},
  title      = {A Basic Language Resource Kit Implementation for the {IgboNLP} Project},
  year       = {2018},
  issue_date = {June 2018},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  volume     = {17},
  number     = {2},
  issn       = {2375-4699},
  url        = {https://doi.org/10.1145/3146387},
  doi        = {10.1145/3146387},
  abstract   = {Igbo, an African language with around 32 million speakers worldwide, is one of the many languages having few or none of the language processing resources needed for advanced language technology applications. In this article, we describe the approach taken to creating an initial set of resources for Igbo, including an electronic text corpus, a part-of-speech (POS) tagset, and a POS-tagged subcorpus. We discuss the approach taken in gathering texts, the preprocessing of these texts, and the development of the POS tagged corpus. We also discuss some of the problems encountered during corpus and tagset development and the solutions arrived at for these problems.},
  journal    = {ACM Transactions on Asian and Low-Resource Language Information Processing},
  month      = jan,
  articleno  = {10},
  numpages   = {23},
  keywords   = {tokenization, text processing, tagset, segmentation, part-of-speech (POS) tagging, normalization, morphology, language technology, interannotation agreement, human annotator, corpus annotation, corpora, Natural language processing (NLP), Igbo, African language},
}
--- Citation 2 ---
@article{10.1145/3314942,
  author     = {Onyenwe, Ikechukwu E. and Hepple, Mark and Chinedu, Uchechukwu and Ezeani, Ignatius},
  title      = {Toward an Effective {Igbo} Part-of-Speech Tagger},
  year       = {2019},
  issue_date = {December 2019},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  volume     = {18},
  number     = {4},
  issn       = {2375-4699},
  url        = {https://doi.org/10.1145/3314942},
  doi        = {10.1145/3314942},
  abstract   = {Part-of-speech (POS) tagging is a well-established technology for most Western European languages and a few other world languages, but it has not been evaluated on Igbo, an agglutinative African language. This article presents POS tagging experiments conducted using an Igbo corpus as a test bed for identifying the POS taggers and the Machine Learning (ML) methods that can achieve a good performance with the small dataset available for the language. Experiments have been conducted using different well-known POS taggers developed for English or European languages, and different training data styles and sizes. Igbo has a number of language-specific characteristics that present a challenge for effective POS tagging. One interesting case is the wide use of verbs (and nominalizations thereof) that have an inherent noun complement, which form ``linked pairs'' in the POS tagging scheme, but which may appear discontinuously. Another issue is Igbo's highly productive agglutinative morphology, which can produce many variant word forms from a given root. This productivity is a key cause of the out-of-vocabulary (OOV) words observed during Igbo tagging. We report results of experiments on a promising direction for improving tagging performance on such morphologically-inflected OOV words.},
  journal    = {ACM Transactions on Asian and Low-Resource Language Information Processing},
  month      = may,
  articleno  = {42},
  numpages   = {26},
  keywords   = {text processing, tagset, part-of-speech (POS) tagging, morphological analysis, machine learning, language technology, corpus annotation, corpora, POS tagger, Natural language processing (NLP), Igbo, African language},
}
 