% evals-for-every-language / bibliography.bib
@misc{200LanguagesSingle,
title = {200 Languages within a Single {{AI}} Model: {{A}} Breakthrough in High-Quality Machine Translation},
shorttitle = {200 Languages within a Single {{AI}} Model},
urldate = {2024-11-02},
abstract = {Meta AI has built a single AI model, NLLB-200, that is the first to translate across 200 different languages with state-of-the-art quality that has been validated through extensive evaluations for each of them.},
langid = {english},
keywords = {dataset,model,n=200},
file = {/Users/david/Zotero/storage/AU759RXC/nllb-200-high-quality-machine-translation.html}
}
@misc{adelaniIrokoBenchNewBenchmark2025,
title = {{{IrokoBench}}: {{A New Benchmark}} for {{African Languages}} in the {{Age}} of {{Large Language Models}}},
shorttitle = {{{IrokoBench}}},
author = {Adelani, David Ifeoluwa and Ojo, Jessica and Azime, Israel Abebe and Zhuang, Jian Yun and Alabi, Jesujoba O. and He, Xuanli and Ochieng, Millicent and Hooker, Sara and Bukula, Andiswa and Lee, En-Shiun Annie and Chukwuneke, Chiamaka and Buzaaba, Happy and Sibanda, Blessing and Kalipe, Godson and Mukiibi, Jonathan and Kabongo, Salomon and Yuehgoh, Foutse and Setaka, Mmasibidi and Ndolela, Lolwethu and Odu, Nkiruka and Mabuya, Rooweither and Muhammad, Shamsuddeen Hassan and Osei, Salomey and Samb, Sokhar and Guge, Tadesse Kebede and Sherman, Tombekai Vangoni and Stenetorp, Pontus},
year = {2025},
month = jan,
number = {arXiv:2406.03368},
eprint = {2406.03368},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2406.03368},
urldate = {2025-05-28},
abstract = {Despite the widespread adoption of large language models (LLMs), their remarkable capabilities remain limited to a few high-resource languages. Additionally, many low-resource languages (e.g., African languages) are often evaluated only on basic text classification tasks due to the lack of appropriate or comprehensive benchmarks outside of high-resource languages. In this paper, we introduce IrokoBench -- a human-translated benchmark dataset for 17 typologically-diverse low-resource African languages covering three tasks: natural language inference (AfriXNLI), mathematical reasoning (AfriMGSM), and multi-choice knowledge-based question answering (AfriMMLU). We use IrokoBench to evaluate zero-shot, few-shot, and translate-test settings (where test sets are translated into English) across 10 open and six proprietary LLMs. Our evaluation reveals a significant performance gap between high-resource languages (such as English and French) and low-resource African languages. We also observe a significant gap between open and proprietary models, with the best-performing open model, Gemma 2 27B, reaching only 63\% of the performance of the best proprietary model, GPT-4o. In addition, machine-translating the test set to English before evaluation helped to close the gap for larger English-centric models, such as Gemma 2 27B and LLaMa 3.1 70B. These findings suggest that more efforts are needed to develop and adapt LLMs for African languages.},
archiveprefix = {arXiv},
file = {/Users/david/Zotero/storage/9X39YMAR/Adelani et al. - 2025 - IrokoBench A New Benchmark for African Languages in the Age of Large Language Models.pdf;/Users/david/Zotero/storage/L69AFMRS/2406.html}
}
@misc{adelaniSIB200SimpleInclusive2024,
title = {{{SIB-200}}: {{A Simple}}, {{Inclusive}}, and {{Big Evaluation Dataset}} for {{Topic Classification}} in 200+ {{Languages}} and {{Dialects}}},
shorttitle = {{{SIB-200}}},
author = {Adelani, David Ifeoluwa and Liu, Hannah and Shen, Xiaoyu and Vassilyev, Nikita and Alabi, Jesujoba O. and Mao, Yanke and Gao, Haonan and Lee, Annie En-Shiun},
year = {2024},
month = mar,
number = {arXiv:2309.07445},
eprint = {2309.07445},
publisher = {arXiv},
doi = {10.48550/arXiv.2309.07445},
urldate = {2024-11-02},
abstract = {Despite the progress we have recorded in the last few years in multilingual natural language processing, evaluation is typically limited to a small set of languages with available datasets, which excludes a large number of low-resource languages. In this paper, we created SIB-200 -- a large-scale open-sourced benchmark dataset for topic classification in 200 languages and dialects to address the lack of evaluation datasets for Natural Language Understanding (NLU). For many of the languages covered in SIB-200, this is the first publicly available evaluation dataset for NLU. The dataset is based on the Flores-200 machine translation corpus. We annotated the English portion of the dataset and extended the sentence-level annotation to the remaining 203 languages covered in the corpus. Despite the simplicity of this task, our evaluation in fully supervised, cross-lingual transfer, and large-language-model prompting settings shows that there is still a large gap between the performance of high-resource and low-resource languages when multilingual evaluation is scaled to numerous world languages. We found that languages unseen during the pre-training of multilingual language models, under-represented language families (like Nilotic and Atlantic-Congo), and languages from the regions of Africa, the Americas, Oceania, and South East Asia often have the lowest performance on our topic classification dataset. We hope our dataset will encourage a more inclusive evaluation of multilingual language models on a more diverse set of languages. https://github.com/dadelani/sib-200},
archiveprefix = {arXiv},
keywords = {dataset,n=200},
file = {/Users/david/Zotero/storage/UFRJDZRG/Adelani et al. - 2024 - SIB-200 A Simple, Inclusive, and Big Evaluation Dataset for Topic Classification in 200+ Languages.pdf;/Users/david/Zotero/storage/T49BTFIH/2309.html}
}
@misc{AfricaNLPCollection,
title = {{{AfricaNLP Collection}}},
keywords = {dataset-collection}
}
@misc{ahujaMEGAVERSEBenchmarkingLarge2024,
title = {{{MEGAVERSE}}: {{Benchmarking Large Language Models Across Languages}}, {{Modalities}}, {{Models}} and {{Tasks}}},
shorttitle = {{{MEGAVERSE}}},
author = {Ahuja, Sanchit and Aggarwal, Divyanshu and Gumma, Varun and Watts, Ishaan and Sathe, Ashutosh and Ochieng, Millicent and Hada, Rishav and Jain, Prachi and Axmed, Maxamed and Bali, Kalika and Sitaram, Sunayana},
year = {2024},
month = apr,
number = {arXiv:2311.07463},
eprint = {2311.07463},
publisher = {arXiv},
doi = {10.48550/arXiv.2311.07463},
urldate = {2024-11-02},
abstract = {There has been a surge in LLM evaluation research to understand LLM capabilities and limitations. However, much of this research has been confined to English, leaving LLM building and evaluation for non-English languages relatively unexplored. Several new LLMs have been introduced recently, necessitating their evaluation on non-English languages. This study aims to perform a thorough evaluation of the non-English capabilities of SoTA LLMs (GPT-3.5-Turbo, GPT-4, PaLM2, Gemini-Pro, Mistral, Llama2, and Gemma) by comparing them on the same set of multilingual datasets. Our benchmark comprises 22 datasets covering 83 languages, including low-resource African languages. We also include two multimodal datasets in the benchmark and compare the performance of LLaVA models, GPT-4-Vision and Gemini-Pro-Vision. Our experiments show that larger models such as GPT-4, Gemini-Pro and PaLM2 outperform smaller models on various tasks, notably on low-resource languages, with GPT-4 outperforming PaLM2 and Gemini-Pro on more datasets. We also perform a study on data contamination and find that several models are likely to be contaminated with multilingual evaluation benchmarks, necessitating approaches to detect and handle contamination while assessing the multilingual performance of LLMs.},
archiveprefix = {arXiv},
keywords = {dataset,evaluation,n=83},
file = {/Users/david/Zotero/storage/Q8A3WGUG/Ahuja et al. - 2024 - MEGAVERSE Benchmarking Large Language Models Across Languages, Modalities, Models and Tasks.pdf;/Users/david/Zotero/storage/ZHA8FR3E/2311.html}
}
@misc{aliExpandingFLORESBenchmark2024,
title = {Expanding {{FLORES}}+ {{Benchmark}} for More {{Low-Resource Settings}}: {{Portuguese-Emakhuwa Machine Translation Evaluation}}},
shorttitle = {Expanding {{FLORES}}+ {{Benchmark}} for More {{Low-Resource Settings}}},
author = {Ali, Felermino D. M. Antonio and Cardoso, Henrique Lopes and {Sousa-Silva}, Rui},
year = {2024},
month = aug,
number = {arXiv:2408.11457},
eprint = {2408.11457},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2408.11457},
urldate = {2025-05-28},
abstract = {As part of the Open Language Data Initiative shared tasks, we have expanded the FLORES+ evaluation set to include Emakhuwa, a low-resource language widely spoken in Mozambique. We translated the dev and devtest sets from Portuguese into Emakhuwa, and we detail the translation process and quality assurance measures used. Our methodology involved various quality checks, including post-editing and adequacy assessments. The resulting datasets consist of multiple reference sentences for each source. We present baseline results from training a Neural Machine Translation system and fine-tuning existing multilingual translation models. Our findings suggest that spelling inconsistencies remain a challenge in Emakhuwa. Additionally, the baseline models underperformed on this evaluation set, underscoring the necessity for further research to enhance machine translation quality for Emakhuwa. The data is publicly available at https://huggingface.co/datasets/LIACC/Emakhuwa-FLORES.},
archiveprefix = {arXiv},
file = {/Users/david/Zotero/storage/THELPPXB/Ali et al. - 2024 - Expanding FLORES+ Benchmark for more Low-Resource Settings Portuguese-Emakhuwa Machine Translation.pdf;/Users/david/Zotero/storage/NWMT7ZHL/2408.html}
}
@misc{bapnaBuildingMachineTranslation2022,
title = {Building {{Machine Translation Systems}} for the {{Next Thousand Languages}}},
author = {Bapna, Ankur and Caswell, Isaac and Kreutzer, Julia and Firat, Orhan and van Esch, Daan and Siddhant, Aditya and Niu, Mengmeng and Baljekar, Pallavi and Garcia, Xavier and Macherey, Wolfgang and Breiner, Theresa and Axelrod, Vera and Riesa, Jason and Cao, Yuan and Chen, Mia Xu and Macherey, Klaus and Krikun, Maxim and Wang, Pidong and Gutkin, Alexander and Shah, Apurva and Huang, Yanping and Chen, Zhifeng and Wu, Yonghui and Hughes, Macduff},
year = {2022},
month = jul,
number = {arXiv:2205.03983},
eprint = {2205.03983},
publisher = {arXiv},
doi = {10.48550/arXiv.2205.03983},
urldate = {2024-11-02},
abstract = {In this paper we share findings from our effort to build practical machine translation (MT) systems capable of translating across over one thousand languages. We describe results in three research domains: (i) Building clean, web-mined datasets for 1500+ languages by leveraging semi-supervised pre-training for language identification and developing data-driven filtering techniques; (ii) Developing practical MT models for under-served languages by leveraging massively multilingual models trained with supervised parallel data for over 100 high-resource languages and monolingual datasets for an additional 1000+ languages; and (iii) Studying the limitations of evaluation metrics for these languages and conducting qualitative analysis of the outputs from our MT models, highlighting several frequent error modes of these types of models. We hope that our work provides useful insights to practitioners working towards building MT systems for currently understudied languages, and highlights research directions that can complement the weaknesses of massively multilingual models in data-sparse settings.},
archiveprefix = {arXiv},
keywords = {dataset,model,n=1500},
file = {/Users/david/Zotero/storage/YCW6FWWE/Bapna et al. - 2022 - Building Machine Translation Systems for the Next Thousand Languages.pdf;/Users/david/Zotero/storage/EL7PA6YJ/2205.html}
}
@article{costa-jussaScalingNeuralMachine2024a,
title = {Scaling Neural Machine Translation to 200 Languages},
author = {{Costa-juss{\`a}}, Marta R. and Cross, James and {\c C}elebi, Onur and Elbayad, Maha and Heafield, Kenneth and Heffernan, Kevin and Kalbassi, Elahe and Lam, Janice and Licht, Daniel and Maillard, Jean and Sun, Anna and Wang, Skyler and Wenzek, Guillaume and Youngblood, Al and Akula, Bapi and Barrault, Loic and Gonzalez, Gabriel Mejia and Hansanti, Prangthip and Hoffman, John and Jarrett, Semarley and Sadagopan, Kaushik Ram and Rowe, Dirk and Spruit, Shannon and Tran, Chau and Andrews, Pierre and Ayan, Necip Fazil and Bhosale, Shruti and Edunov, Sergey and Fan, Angela and Gao, Cynthia and Goswami, Vedanuj and Guzm{\'a}n, Francisco and Koehn, Philipp and Mourachko, Alexandre and Ropers, Christophe and Saleem, Safiyyah and Schwenk, Holger and Wang, Jeff and {NLLB Team}},
year = {2024},
month = jun,
journal = {Nature},
volume = {630},
number = {8018},
pages = {841--846},
publisher = {Nature Publishing Group},
issn = {1476-4687},
doi = {10.1038/s41586-024-07335-x},
urldate = {2025-05-28},
abstract = {The development of neural techniques has opened up new avenues for research in machine translation. Today, neural machine translation (NMT) systems can leverage highly multilingual capacities and even perform zero-shot translation, delivering promising results in terms of language coverage and quality. However, scaling quality NMT requires large volumes of parallel bilingual data, which are not equally available for the 7,000+ languages in the world. Focusing on improving the translation qualities of a relatively small group of high-resource languages comes at the expense of directing research attention to low-resource languages, exacerbating digital inequities in the long run. To break this pattern, here we introduce No Language Left Behind---a single massively multilingual model that leverages transfer learning across languages. We developed a conditional computational model based on the Sparsely Gated Mixture of Experts architecture, which we trained on data obtained with new mining techniques tailored for low-resource languages. Furthermore, we devised multiple architectural and training improvements to counteract overfitting while training on thousands of tasks. We evaluated the performance of our model over 40,000 translation directions using tools created specifically for this purpose---an automatic benchmark (FLORES-200), a human evaluation metric (XSTS) and a toxicity detector that covers every language in our model. Compared with the previous state-of-the-art models, our model achieves an average of 44\% improvement in translation quality as measured by BLEU. By demonstrating how to scale NMT to 200 languages and making all contributions in this effort freely available for non-commercial use, our work lays important groundwork for the development of a universal translation system.},
copyright = {2024 Meta},
langid = {english},
file = {/Users/david/Zotero/storage/6RWFDDH5/Costa-jussà et al. - 2024 - Scaling neural machine translation to 200 languages.pdf}
}
@book{eberhard2024ethnologue,
title = {Ethnologue: {{Languages}} of the World},
editor = {Eberhard, David M. and Simons, Gary F. and Fennig, Charles D.},
year = {2024},
edition = {27},
publisher = {SIL International},
address = {Dallas, Texas}
}
@inproceedings{federmannNTREX128NewsTest2022,
title = {{{NTREX-128}} -- {{News Test References}} for {{MT Evaluation}} of 128 {{Languages}}},
booktitle = {Proceedings of the {{First Workshop}} on {{Scaling Up Multilingual Evaluation}}},
author = {Federmann, Christian and Kocmi, Tom and Xin, Ying},
editor = {Ahuja, Kabir and Anastasopoulos, Antonios and Patra, Barun and Neubig, Graham and Choudhury, Monojit and Dandapat, Sandipan and Sitaram, Sunayana and Chaudhary, Vishrav},
year = {2022},
month = nov,
pages = {21--24},
publisher = {Association for Computational Linguistics},
address = {Online},
urldate = {2024-11-02},
keywords = {dataset,n=128},
file = {/Users/david/Zotero/storage/E286EDPU/Federmann et al. - 2022 - NTREX-128 – News Test References for MT Evaluation of 128 Languages.pdf}
}
@inproceedings{gordeevFLORESTranslationMachine2024,
title = {{{FLORES}}+ {{Translation}} and {{Machine Translation Evaluation}} for the {{Erzya Language}}},
booktitle = {Proceedings of the {{Ninth Conference}} on {{Machine Translation}}},
author = {Gordeev, Isai and Kuldin, Sergey and Dale, David},
editor = {Haddow, Barry and Kocmi, Tom and Koehn, Philipp and Monz, Christof},
year = {2024},
month = nov,
pages = {614--623},
publisher = {Association for Computational Linguistics},
address = {Miami, Florida, USA},
doi = {10.18653/v1/2024.wmt-1.49},
urldate = {2025-05-28},
abstract = {This paper introduces a translation of the FLORES+ dataset into the endangered Erzya language, with the goal of evaluating machine translation between this language and any of the other 200 languages already included in FLORES+. This translation was carried out as part of the Open Language Data shared task at WMT24. We also present a benchmark of existing translation models based on this dataset, and a new translation model that achieves state-of-the-art quality for translation into Erzya from Russian and English.},
file = {/Users/david/Zotero/storage/KHZ84I94/Gordeev et al. - 2024 - FLORES+ Translation and Machine Translation Evaluation for the Erzya Language.pdf}
}
@misc{gurgurovLowREmRepositoryWord2024,
title = {{{LowREm}}: {{A Repository}} of {{Word Embeddings}} for 87 {{Low-Resource Languages Enhanced}} with {{Multilingual Graph Knowledge}}},
shorttitle = {{{LowREm}}},
author = {Gurgurov, Daniil and Kumar, Rishu and Ostermann, Simon},
year = {2024},
month = sep,
number = {arXiv:2409.18193},
eprint = {2409.18193},
publisher = {arXiv},
doi = {10.48550/arXiv.2409.18193},
urldate = {2024-11-02},
abstract = {Contextualized embeddings based on large language models (LLMs) are available for various languages, but their coverage is often limited for lower resourced languages. Training LLMs for such languages is often difficult due to insufficient data and high computational cost. Especially for very low resource languages, static word embeddings thus still offer a viable alternative. There is, however, a notable lack of comprehensive repositories with such embeddings for diverse languages. To address this, we present LowREm, a centralized repository of static embeddings for 87 low-resource languages. We also propose a novel method to enhance GloVe-based embeddings by integrating multilingual graph knowledge, utilizing another source of knowledge. We demonstrate the superior performance of our enhanced embeddings as compared to contextualized embeddings extracted from XLM-R on sentiment analysis. Our code and data are publicly available under https://huggingface.co/DFKI.},
archiveprefix = {arXiv},
keywords = {model,n=87},
file = {/Users/david/Zotero/storage/CGG3Y22P/Gurgurov et al. - 2024 - LowREm A Repository of Word Embeddings for 87 Low-Resource Languages Enhanced with Multilingual Gra.pdf;/Users/david/Zotero/storage/TJLLL6RT/2409.html}
}
@misc{HPLTDatasetsV2,
title = {{{HPLT Datasets}} V2},
urldate = {2024-11-02},
keywords = {dataset,n=193},
file = {/Users/david/Zotero/storage/UFLXIHNC/v2.html}
}
@inproceedings{joshiStateFateLinguistic2020,
title = {The {{State}} and {{Fate}} of {{Linguistic Diversity}} and {{Inclusion}} in the {{NLP World}}},
booktitle = {Proceedings of the 58th {{Annual Meeting}} of the {{Association}} for {{Computational Linguistics}}},
author = {Joshi, Pratik and Santy, Sebastin and Budhiraja, Amar and Bali, Kalika and Choudhury, Monojit},
year = {2020},
pages = {6282--6293},
publisher = {Association for Computational Linguistics},
address = {Online},
doi = {10.18653/v1/2020.acl-main.560},
urldate = {2024-11-02},
langid = {english},
keywords = {evaluation,n=2500},
file = {/Users/david/Zotero/storage/TDKP4GV9/Joshi et al. - 2020 - The State and Fate of Linguistic Diversity and Inclusion in the NLP World.pdf}
}
@misc{LacunaLanguageDatasets,
title = {Lacuna {{Language Datasets}}},
urldate = {2024-11-02},
langid = {american},
keywords = {dataset-collection},
file = {/Users/david/Zotero/storage/SMRV9HE2/language.html}
}
@misc{laiOkapiInstructiontunedLarge2023,
title = {Okapi: {{Instruction-tuned Large Language Models}} in {{Multiple Languages}} with {{Reinforcement Learning}} from {{Human Feedback}}},
shorttitle = {Okapi},
author = {Lai, Viet Dac and Nguyen, Chien Van and Ngo, Nghia Trung and Nguyen, Thuat and Dernoncourt, Franck and Rossi, Ryan A. and Nguyen, Thien Huu},
year = {2023},
month = aug,
number = {arXiv:2307.16039},
eprint = {2307.16039},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2307.16039},
urldate = {2025-05-28},
abstract = {A key technology for the development of large language models (LLMs) involves instruction tuning that helps align the models' responses with human expectations to realize impressive learning abilities. Two major approaches for instruction tuning are supervised fine-tuning (SFT) and reinforcement learning from human feedback (RLHF), which are currently applied to produce the best commercial LLMs (e.g., ChatGPT). To improve the accessibility of LLMs for research and development efforts, various instruction-tuned open-source LLMs have also been introduced recently, e.g., Alpaca and Vicuna, to name a few. However, existing open-source LLMs have only been instruction-tuned for English and a few popular languages, thus hindering their impact and accessibility for many other languages in the world. Among the few very recent works that explore instruction tuning for LLMs in multiple languages, SFT has been used as the only approach to instruction-tune LLMs for multiple languages. This has left a significant gap for fine-tuned LLMs based on RLHF in diverse languages and raised important questions on how RLHF can boost the performance of multilingual instruction tuning. To overcome this issue, we present Okapi, the first system with instruction-tuned LLMs based on RLHF for multiple languages. Okapi introduces instruction and response-ranked data in 26 diverse languages to facilitate the experiments and development of future multilingual LLM research. We also present benchmark datasets to enable the evaluation of generative LLMs in multiple languages. Our experiments demonstrate the advantages of RLHF for multilingual instruction over SFT for different base models and datasets. Our framework and resources are released at https://github.com/nlp-uoregon/Okapi.},
archiveprefix = {arXiv},
file = {/Users/david/Zotero/storage/2GB79E4F/Lai et al. - 2023 - Okapi Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning fro.pdf;/Users/david/Zotero/storage/VZXBL7F2/2307.html}
}
@misc{Lanfrica,
title = {Lanfrica},
urldate = {2024-11-02},
abstract = {Lanfrica catalogues, archives and links African language resources in order to mitigate the difficulty encountered in discovering African works.},
howpublished = {https://lanfrica.com/records},
langid = {english}
}
@inproceedings{maillardSmallDataBig2023,
title = {Small {{Data}}, {{Big Impact}}: {{Leveraging Minimal Data}} for {{Effective Machine Translation}}},
shorttitle = {Small {{Data}}, {{Big Impact}}},
booktitle = {Proceedings of the 61st {{Annual Meeting}} of the {{Association}} for {{Computational Linguistics}} ({{Volume}} 1: {{Long Papers}})},
author = {Maillard, Jean and Gao, Cynthia and Kalbassi, Elahe and Sadagopan, Kaushik Ram and Goswami, Vedanuj and Koehn, Philipp and Fan, Angela and Guzman, Francisco},
editor = {Rogers, Anna and {Boyd-Graber}, Jordan and Okazaki, Naoaki},
year = {2023},
month = jul,
pages = {2740--2756},
publisher = {Association for Computational Linguistics},
address = {Toronto, Canada},
doi = {10.18653/v1/2023.acl-long.154},
urldate = {2024-11-02},
abstract = {For many languages, machine translation progress is hindered by the lack of reliable training data. Models are trained on whatever pre-existing datasets may be available and then augmented with synthetic data, because it is often not economical to pay for the creation of large-scale datasets. But for the case of low-resource languages, would the creation of a few thousand professionally translated sentence pairs give any benefit? In this paper, we show that it does. We describe a broad data collection effort involving around 6k professionally translated sentence pairs for each of 39 low-resource languages, which we make publicly available. We analyse the gains of models trained on this small but high-quality data, showing that it has significant impact even when larger but lower quality pre-existing corpora are used, or when data is augmented with millions of sentences through backtranslation.},
keywords = {dataset,n=39},
file = {/Users/david/Zotero/storage/6BYYZ7V2/Maillard et al. - 2023 - Small Data, Big Impact Leveraging Minimal Data for Effective Machine Translation.pdf}
}
@misc{MasakhaneAfrimmluDatasets2024,
title = {Masakhane/Afrimmlu {$\cdot$} {{Datasets}} at {{Hugging Face}}},
year = {2024},
month = jun,
urldate = {2025-05-28},
howpublished = {https://huggingface.co/datasets/masakhane/afrimmlu}
}
@inproceedings{nekotoParticipatoryResearchLowresourced2020,
title = {Participatory {{Research}} for {{Low-resourced Machine Translation}}: {{A Case Study}} in {{African Languages}}},
shorttitle = {Participatory {{Research}} for {{Low-resourced Machine Translation}}},
booktitle = {Findings of the {{Association}} for {{Computational Linguistics}}: {{EMNLP}} 2020},
author = {Nekoto, Wilhelmina and Marivate, Vukosi and Matsila, Tshinondiwa and Fasubaa, Timi and Fagbohungbe, Taiwo and Akinola, Solomon Oluwole and Muhammad, Shamsuddeen and Kabongo Kabenamualu, Salomon and Osei, Salomey and Sackey, Freshia and Niyongabo, Rubungo Andre and Macharm, Ricky and Ogayo, Perez and Ahia, Orevaoghene and Berhe, Musie Meressa and Adeyemi, Mofetoluwa and {Mokgesi-Selinga}, Masabata and Okegbemi, Lawrence and Martinus, Laura and Tajudeen, Kolawole and Degila, Kevin and Ogueji, Kelechi and Siminyu, Kathleen and Kreutzer, Julia and Webster, Jason and Ali, Jamiil Toure and Abbott, Jade and Orife, Iroro and Ezeani, Ignatius and Dangana, Idris Abdulkadir and Kamper, Herman and Elsahar, Hady and Duru, Goodness and Kioko, Ghollah and Espoir, Murhabazi and {van Biljon}, Elan and Whitenack, Daniel and Onyefuluchi, Christopher and Emezue, Chris Chinenye and Dossou, Bonaventure F. P. and Sibanda, Blessing and Bassey, Blessing and Olabiyi, Ayodele and Ramkilowan, Arshath and {\"O}ktem, Alp and Akinfaderin, Adewale and Bashir, Abdallah},
editor = {Cohn, Trevor and He, Yulan and Liu, Yang},
year = {2020},
month = nov,
pages = {2144--2160},
publisher = {Association for Computational Linguistics},
address = {Online},
doi = {10.18653/v1/2020.findings-emnlp.195},
urldate = {2024-11-02},
abstract = {Research in NLP lacks geographic diversity, and the question of how NLP can be scaled to low-resourced languages has not yet been adequately solved. `Low-resourced'-ness is a complex problem going beyond data availability and reflects systemic problems in society. In this paper, we focus on the task of Machine Translation (MT), that plays a crucial role for information accessibility and communication worldwide. Despite immense improvements in MT over the past decade, MT is centered around a few high-resourced languages. As MT researchers cannot solve the problem of low-resourcedness alone, we propose participatory research as a means to involve all necessary agents required in the MT development process. We demonstrate the feasibility and scalability of participatory research with a case study on MT for African languages. Its implementation leads to a collection of novel translation datasets, MT benchmarks for over 30 languages, with human evaluations for a third of them, and enables participants without formal training to make a unique scientific contribution. Benchmarks, models, data, code, and evaluation results are released at https://github.com/masakhane-io/masakhane-mt.},
keywords = {dataset-collection,n=30},
file = {/Users/david/Zotero/storage/JJ2S8CT3/Nekoto et al. - 2020 - Participatory Research for Low-resourced Machine Translation A Case Study in African Languages.pdf}
}
@misc{OpenaiMMMLUDatasets2024,
title = {Openai/{{MMMLU}} {$\cdot$} {{Datasets}} at {{Hugging Face}}},
year = {2024},
month = oct,
urldate = {2025-05-28},
howpublished = {https://huggingface.co/datasets/openai/MMMLU},
file = {/Users/david/Zotero/storage/LPEN8SFL/MMMLU.html}
}
@misc{OpenlanguagedataFlores2024,
title = {Openlanguagedata/Flores},
year = {2024},
month = nov,
urldate = {2024-11-02},
abstract = {The FLORES+ Machine Translation Benchmark},
copyright = {CC-BY-SA-4.0},
howpublished = {https://github.com/openlanguagedata/flores},
keywords = {dataset,n=200}
}
@misc{petrovLanguageModelTokenizers2023,
title = {Language {{Model Tokenizers Introduce Unfairness Between Languages}}},
author = {Petrov, Aleksandar and {La Malfa}, Emanuele and Torr, Philip H. S. and Bibi, Adel},
year = {2023},
month = oct,
number = {arXiv:2305.15425},
eprint = {2305.15425},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2305.15425},
urldate = {2025-03-24},
abstract = {Recent language models have shown impressive multilingual performance, even when not explicitly trained for it. Despite this, there are concerns about the quality of their outputs across different languages. In this paper, we show how disparity in the treatment of different languages arises at the tokenization stage, well before a model is even invoked. The same text translated into different languages can have drastically different tokenization lengths, with differences up to 15 times in some cases. These disparities persist even for tokenizers that are intentionally trained for multilingual support. Character-level and byte-level models also exhibit over 4 times the difference in the encoding length for some language pairs. This induces unfair treatment for some language communities in regard to the cost of accessing commercial language services, the processing time and latency, as well as the amount of content that can be provided as context to the models. Therefore, we make the case that we should train future language models using multilingually fair subword tokenizers.},
archiveprefix = {arXiv},
file = {/Users/david/Zotero/storage/NX3DT98Z/Petrov et al. - 2023 - Language Model Tokenizers Introduce Unfairness Between Languages.pdf;/Users/david/Zotero/storage/FKJQP22D/2305.html}
}
@inproceedings{robinsonChatGPTMTCompetitive2023,
title = {{{ChatGPT MT}}: {{Competitive}} for {{High-}} (but {{Not Low-}}) {{Resource Languages}}},
shorttitle = {{{ChatGPT MT}}},
booktitle = {Proceedings of the {{Eighth Conference}} on {{Machine Translation}}},
author = {Robinson, Nathaniel and Ogayo, Perez and Mortensen, David R. and Neubig, Graham},
editor = {Koehn, Philipp and Haddow, Barry and Kocmi, Tom and Monz, Christof},
year = {2023},
month = dec,
pages = {392--418},
publisher = {Association for Computational Linguistics},
address = {Singapore},
doi = {10.18653/v1/2023.wmt-1.40},
urldate = {2024-11-02},
abstract = {Large language models (LLMs) implicitly learn to perform a range of language tasks, including machine translation (MT). Previous studies explore aspects of LLMs' MT capabilities. However, there exist a wide variety of languages for which recent LLM MT performance has never before been evaluated. Without published experimental evidence on the matter, it is difficult for speakers of the world's diverse languages to know how and whether they can use LLMs for their languages. We present the first experimental evidence for an expansive set of 204 languages, along with MT cost analysis, using the FLORES-200 benchmark. Trends reveal that GPT models approach or exceed traditional MT model performance for some high-resource languages (HRLs) but consistently lag for low-resource languages (LRLs), under-performing traditional MT for 84.1\% of languages we covered. Our analysis reveals that a language's resource level is the most important feature in determining ChatGPT's relative ability to translate it, and suggests that ChatGPT is especially disadvantaged for LRLs and African languages.},
keywords = {evaluation,n=200},
file = {/Users/david/Zotero/storage/BWFBTAZ9/Robinson et al. - 2023 - ChatGPT MT Competitive for High- (but Not Low-) Resource Languages.pdf}
}
@misc{shiLanguageModelsAre2022,
title = {Language {{Models}} Are {{Multilingual Chain-of-Thought Reasoners}}},
author = {Shi, Freda and Suzgun, Mirac and Freitag, Markus and Wang, Xuezhi and Srivats, Suraj and Vosoughi, Soroush and Chung, Hyung Won and Tay, Yi and Ruder, Sebastian and Zhou, Denny and Das, Dipanjan and Wei, Jason},
year = {2022},
month = oct,
number = {arXiv:2210.03057},
eprint = {2210.03057},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2210.03057},
urldate = {2025-05-28},
abstract = {We evaluate the reasoning abilities of large language models in multilingual settings. We introduce the Multilingual Grade School Math (MGSM) benchmark, by manually translating 250 grade-school math problems from the GSM8K dataset (Cobbe et al., 2021) into ten typologically diverse languages. We find that the ability to solve MGSM problems via chain-of-thought prompting emerges with increasing model scale, and that models have strikingly strong multilingual reasoning abilities, even in underrepresented languages such as Bengali and Swahili. Finally, we show that the multilingual reasoning abilities of language models extend to other tasks such as commonsense reasoning and word-in-context semantic judgment. The MGSM benchmark is publicly available at https://github.com/google-research/url-nlp.},
archiveprefix = {arXiv},
file = {/Users/david/Zotero/storage/3W9ATYCI/Shi et al. - 2022 - Language Models are Multilingual Chain-of-Thought Reasoners.pdf;/Users/david/Zotero/storage/4HED3DGQ/2210.html}
}
@misc{siminyuAI4DAfricanLanguage2021,
title = {{{AI4D}} -- {{African Language Program}}},
author = {Siminyu, Kathleen and Kalipe, Godson and Orlic, Davor and Abbott, Jade and Marivate, Vukosi and Sackey, Freshia and Sibal, Prateek and Neupane, Bhanu and Adelani, David I. and Taylor, Amelia and Ali, Jamiil Toure and Degila, Kevin and Balogoun, Momboladji and Diop, Thierno Ibrahima and David, Davis and Fourati, Chayma and Haddad, Hatem and Naski, Malek},
year = {2021},
month = apr,
number = {arXiv:2104.02516},
eprint = {2104.02516},
publisher = {arXiv},
doi = {10.48550/arXiv.2104.02516},
urldate = {2024-11-02},
abstract = {Advances in speech and language technologies enable tools such as voice-search, text-to-speech, speech recognition and machine translation. These are however only available for high resource languages like English, French or Chinese. Without foundational digital resources for African languages, which are considered low-resource in the digital context, these advanced tools remain out of reach. This work details the AI4D - African Language Program, a 3-part project that 1) incentivised the crowd-sourcing, collection and curation of language datasets through an online quantitative and qualitative challenge, 2) supported research fellows for a period of 3-4 months to create datasets annotated for NLP tasks, and 3) hosted competitive Machine Learning challenges on the basis of these datasets. Key outcomes of the work so far include 1) the creation of 9+ open source, African language datasets annotated for a variety of ML tasks, and 2) the creation of baseline models for these datasets through hosting of competitive ML challenges.},
archiveprefix = {arXiv},
keywords = {dataset-collection},
file = {/Users/david/Zotero/storage/VU6IFENR/Siminyu et al. - 2021 - AI4D -- African Language Program.pdf;/Users/david/Zotero/storage/7TV2PS8J/2104.html}
}
@misc{singhGlobalMMLUUnderstanding2025,
title = {Global {{MMLU}}: {{Understanding}} and {{Addressing Cultural}} and {{Linguistic Biases}} in {{Multilingual Evaluation}}},
shorttitle = {Global {{MMLU}}},
author = {Singh, Shivalika and Romanou, Angelika and Fourrier, Cl{\'e}mentine and Adelani, David I. and Ngui, Jian Gang and {Vila-Suero}, Daniel and Limkonchotiwat, Peerat and Marchisio, Kelly and Leong, Wei Qi and Susanto, Yosephine and Ng, Raymond and Longpre, Shayne and Ko, Wei-Yin and Ruder, Sebastian and Smith, Madeline and Bosselut, Antoine and Oh, Alice and Martins, Andre F. T. and Choshen, Leshem and Ippolito, Daphne and Ferrante, Enzo and Fadaee, Marzieh and Ermis, Beyza and Hooker, Sara},
year = {2025},
month = feb,
number = {arXiv:2412.03304},
eprint = {2412.03304},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2412.03304},
urldate = {2025-05-28},
abstract = {Cultural biases in multilingual datasets pose significant challenges for their effectiveness as global benchmarks. These biases stem not only from differences in language but also from the cultural knowledge required to interpret questions, reducing the practical utility of translated datasets like MMLU. Furthermore, translation often introduces artefacts that can distort the meaning or clarity of questions in the target language. A common practice in multilingual evaluation is to rely on machine-translated evaluation sets, but simply translating a dataset is insufficient to address these challenges. In this work, we trace the impact of both of these issues on multilingual evaluations and ensuing model performances. Our large-scale evaluation of state-of-the-art open and proprietary models illustrates that progress on MMLU depends heavily on learning Western-centric concepts, with 28\% of all questions requiring culturally sensitive knowledge. Moreover, for questions requiring geographic knowledge, an astounding 84.9\% focus on either North American or European regions. Rankings of model evaluations change depending on whether they are evaluated on the full portion or the subset of questions annotated as culturally sensitive, showing the distortion to model rankings when blindly relying on translated MMLU. We release Global MMLU, an improved MMLU with evaluation coverage across 42 languages -- with improved overall quality by engaging with compensated professional and community annotators to verify translation quality while also rigorously evaluating cultural biases present in the original dataset. This comprehensive Global MMLU set also includes designated subsets labeled as culturally sensitive and culturally agnostic to allow for more holistic, complete evaluation.},
archiveprefix = {arXiv},
file = {/Users/david/Zotero/storage/23LXUKPU/Singh et al. - 2025 - Global MMLU Understanding and Addressing Cultural and Linguistic Biases in Multilingual Evaluation.pdf;/Users/david/Zotero/storage/563VJ87N/2412.html}
}
@misc{Tatoeba,
title = {Tatoeba},
urldate = {2024-11-03},
file = {/Users/david/Zotero/storage/4NDTCGWG/sentences_by_language.html}
}
@misc{teamNoLanguageLeft2022,
title = {No {{Language Left Behind}}: {{Scaling Human-Centered Machine Translation}}},
shorttitle = {No {{Language Left Behind}}},
author = {Team, {\relax NLLB} and {Costa-juss{\`a}}, Marta R. and Cross, James and {\c C}elebi, Onur and Elbayad, Maha and Heafield, Kenneth and Heffernan, Kevin and Kalbassi, Elahe and Lam, Janice and Licht, Daniel and Maillard, Jean and Sun, Anna and Wang, Skyler and Wenzek, Guillaume and Youngblood, Al and Akula, Bapi and Barrault, Loic and Gonzalez, Gabriel Mejia and Hansanti, Prangthip and Hoffman, John and Jarrett, Semarley and Sadagopan, Kaushik Ram and Rowe, Dirk and Spruit, Shannon and Tran, Chau and Andrews, Pierre and Ayan, Necip Fazil and Bhosale, Shruti and Edunov, Sergey and Fan, Angela and Gao, Cynthia and Goswami, Vedanuj and Guzm{\'a}n, Francisco and Koehn, Philipp and Mourachko, Alexandre and Ropers, Christophe and Saleem, Safiyyah and Schwenk, Holger and Wang, Jeff},
year = {2022},
month = aug,
number = {arXiv:2207.04672},
eprint = {2207.04672},
publisher = {arXiv},
doi = {10.48550/arXiv.2207.04672},
urldate = {2024-11-02},
abstract = {Driven by the goal of eradicating language barriers on a global scale, machine translation has solidified itself as a key focus of artificial intelligence research today. However, such efforts have coalesced around a small subset of languages, leaving behind the vast majority of mostly low-resource languages. What does it take to break the 200 language barrier while ensuring safe, high quality results, all while keeping ethical considerations in mind? In No Language Left Behind, we took on this challenge by first contextualizing the need for low-resource language translation support through exploratory interviews with native speakers. Then, we created datasets and models aimed at narrowing the performance gap between low and high-resource languages. More specifically, we developed a conditional compute model based on Sparsely Gated Mixture of Experts that is trained on data obtained with novel and effective data mining techniques tailored for low-resource languages. We propose multiple architectural and training improvements to counteract overfitting while training on thousands of tasks. Critically, we evaluated the performance of over 40,000 different translation directions using a human-translated benchmark, Flores-200, and combined human evaluation with a novel toxicity benchmark covering all languages in Flores-200 to assess translation safety. Our model achieves an improvement of 44\% BLEU relative to the previous state-of-the-art, laying important groundwork towards realizing a universal translation system. Finally, we open source all contributions described in this work, accessible at https://github.com/facebookresearch/fairseq/tree/nllb.},
archiveprefix = {arXiv},
keywords = {dataset,model,n=200},
file = {/Users/david/Zotero/storage/GHWEGFFS/Team et al. - 2022 - No Language Left Behind Scaling Human-Centered Machine Translation.pdf;/Users/david/Zotero/storage/SZK3CP9C/2207.html}
}
@misc{thellmannMultilingualLLMEvaluation2024,
title = {Towards {{Multilingual LLM Evaluation}} for {{European Languages}}},
author = {Thellmann, Klaudia and Stadler, Bernhard and Fromm, Michael and Buschhoff, Jasper Schulze and Jude, Alex and Barth, Fabio and Leveling, Johannes and {Flores-Herr}, Nicolas and K{\"o}hler, Joachim and J{\"a}kel, Ren{\'e} and Ali, Mehdi},
year = {2024},
month = oct,
number = {arXiv:2410.08928},
eprint = {2410.08928},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2410.08928},
urldate = {2025-05-28},
abstract = {The rise of Large Language Models (LLMs) has revolutionized natural language processing across numerous languages and tasks. However, evaluating LLM performance in a consistent and meaningful way across multiple European languages remains challenging, especially due to the scarcity of language-parallel multilingual benchmarks. We introduce a multilingual evaluation approach tailored for European languages. We employ translated versions of five widely-used benchmarks to assess the capabilities of 40 LLMs across 21 European languages. Our contributions include examining the effectiveness of translated benchmarks, assessing the impact of different translation services, and offering a multilingual evaluation framework for LLMs that includes newly created datasets: EU20-MMLU, EU20-HellaSwag, EU20-ARC, EU20-TruthfulQA, and EU20-GSM8K. The benchmarks and results are made publicly available to encourage further research in multilingual LLM evaluation.},
archiveprefix = {arXiv},
file = {/Users/david/Zotero/storage/DCEINCKD/Thellmann et al. - 2024 - Towards Multilingual LLM Evaluation for European Languages.pdf;/Users/david/Zotero/storage/EAQS33RW/2410.html}
}