Evaluating Open-Source Large Language Models for Synthetic Non-English Medical Data Generation Using Prompt-Based Techniques
Abstract
Using synthetic data sets to train medicine-focused machine learning models has been shown to enhance their performance; however, most research focuses on English texts. In this paper, we explore generating non-English synthetic medical texts. We propose a methodology for generating medical synthetic data, showcasing it by generating medical texts written in a non-English mixed language. We evaluate our approach with thirteen different language models, both open-source and proprietary, and assess the quality of the generated data sets in two ways: performing a statistical comparison between the original data set and the generated data sets, and training a classifier to distinguish between original and synthetic examples. Because a lower classifier score means the synthetic texts are harder to tell apart from the originals, the Llama-3.2-3B model performs best, with the lowest F1 score of 0.821 ± 0.007 and accuracy of 0.816 ± 0.016, making it the most suitable for generating indistinguishable medical synthetic data. In contrast, models like Aya-23, Phi-3, and SmolLM variants achieve high F1 scores (0.945–0.948), indicating that their synthetic data is easily distinguishable from the original data. These findings highlight the importance of model selection when generating synthetic medical data sets in non-English languages.
Full Text:
PDF

References
% DataDreamer tool paper, published in the ACL 2024 long-paper proceedings.
% Typed as inproceedings because it carries booktitle, editor, pages and
% publisher, none of which a misc entry prints under the standard styles.
% Whole words (not single letters) are braced to protect their capitalisation.
@inproceedings{patel2024datadreamertoolsyntheticdata,
  author    = {Patel, Ajay and
               Raffel, Colin and
               Callison-Burch, Chris},
  title     = {{DataDreamer}: A Tool for Synthetic Data Generation and Reproducible {LLM} Workflows},
  editor    = {Ku, Lun-Wei and
               Martins, Andre and
               Srikumar, Vivek},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  pages     = {3781--3799},
  doi       = {10.18653/v1/2024.acl-long.208},
  url       = {https://aclanthology.org/2024.acl-long.208},
}
% No Language Left Behind (NLLB) arXiv preprint.
% The team name is wrapped in its own braces so BibTeX keeps it as one
% corporate author instead of parsing it as first name "NLLB", last name "Team".
@misc{nllbteam2022languageleftbehindscaling,
  author        = {{NLLB Team} and Marta R. Costa-jussà and James Cross and
                   Onur Çelebi and Maha Elbayad and Kenneth Heafield and
                   Kevin Heffernan and Elahe Kalbassi and Janice Lam and
                   Daniel Licht and Jean Maillard and Anna Sun and
                   Skyler Wang and Guillaume Wenzek and Al Youngblood and
                   Bapi Akula and Loic Barrault and Gabriel Mejia Gonzalez and
                   Prangthip Hansanti and John Hoffman and Semarley Jarrett and
                   Kaushik Ram Sadagopan and Dirk Rowe and Shannon Spruit and
                   Chau Tran and Pierre Andrews and Necip Fazil Ayan and
                   Shruti Bhosale and Sergey Edunov and Angela Fan and
                   Cynthia Gao and Vedanuj Goswami and Francisco Guzmán and
                   Philipp Koehn and Alexandre Mourachko and Christophe Ropers and
                   Safiyyah Saleem and Holger Schwenk and Jeff Wang},
  title         = {No Language Left Behind: Scaling Human-Centered Machine Translation},
  year          = {2022},
  eprint        = {2207.04672},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% Llama 3 model card hosted on GitHub.
% This is a web document with no journal, so it is a misc entry (an article
% entry without a journal field triggers a missing-field warning).
% The corporate author is double-braced so it stays a single name.
@misc{llama3modelcard,
  author = {{AI@Meta}},
  title  = {Llama 3 Model Card},
  year   = {2024},
  url    = {https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md},
}
% Llama 3 technical report (arXiv preprint).
% NOTE(review): the previous DOI (10.48550/arxiv.2309.03882) resolves to a
% different arXiv paper; 2407.21783 is the identifier for "The Llama 3 Herd of
% Models" - confirm against the arXiv listing.
% The corporate author is double-braced so it is not split into first/last name.
@article{Llama-Team2024-vy,
  author        = {{AI Llama Team}},
  title         = {The {Llama} 3 Herd of Models},
  journal       = {arXiv [cs.CL]},
  year          = {2024},
  eprint        = {2407.21783},
  archivePrefix = {arXiv},
  doi           = {10.48550/arXiv.2407.21783},
}
% Hugging Face Transformers library paper (EMNLP 2020 system demonstrations).
@inproceedings{huggingface_transformers,
  author    = {Wolf, Thomas and
               Debut, Lysandre and
               Sanh, Victor and
               Chaumond, Julien and
               Delangue, Clement and
               Moi, Anthony and
               Cistac, Pierric and
               Rault, Tim and
               Louf, Remi and
               Funtowicz, Morgan and
               Davison, Joe and
               Shleifer, Sam and
               von Platen, Patrick and
               Ma, Clara and
               Jernite, Yacine and
               Plu, Julien and
               Xu, Canwen and
               Le Scao, Teven and
               Gugger, Sylvain and
               Drame, Mariama and
               Lhoest, Quentin and
               Rush, Alexander},
  title     = {Transformers: State-of-the-art natural language processing},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
  publisher = {Association for Computational Linguistics},
  year      = {2020},
  pages     = {38--45},
  doi       = {10.18653/v1/2020.emnlp-demos.6},
}
% NLLB-200 distilled 600M checkpoint on the Hugging Face Hub.
% The link is typeset with \url (the backslash had been lost, which printed
% the literal text "url" in front of the address); the citing document must
% load the url or hyperref package.
% The corporate author is double-braced so it stays a single name.
@misc{nllb-200-distilled-600m,
  author       = {{Facebook AI}},
  title        = {{NLLB-200 Distilled 600M}},
  year         = {2022},
  publisher    = {Hugging Face},
  journal      = {Hugging Face Model Hub},
  howpublished = {\url{https://huggingface.co/facebook/nllb-200-distilled-600M}},
  note         = {Accessed: 2024-07-01},
}
% Pires, Schlinger and Garrette: study of multilingual BERT (57th ACL meeting, 2019).
% NOTE(review): the citation key looks like a DBLP key for arXiv 1810.04805,
% while the fields describe the Pires et al. paper - confirm which work the
% citing text intends. The key is left untouched so existing citations resolve.
@inproceedings{DBLP:journals/corr/abs-1810-04805,
  author    = {Pires, Telmo and Schlinger, Eva and Garrette, Dan},
  title     = {How Multilingual is Multilingual {BERT}?},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  year      = {2019},
  pages     = {4996--5001},
  doi       = {10.18653/v1/P19-1493},
}
% Multilingual cased BERT base checkpoint on the Hugging Face Hub.
% The link is typeset with \url (the backslash had been lost, which printed
% the literal text "url" in front of the address); the citing document must
% load the url or hyperref package.
% The corporate author is double-braced so it stays a single name.
@misc{bert-base-multilingual-cased,
  author       = {{Google Research}},
  title        = {{BERT Base Multilingual Cased}},
  year         = {2019},
  publisher    = {Hugging Face},
  journal      = {Hugging Face Model Hub},
  howpublished = {\url{https://huggingface.co/google-bert/bert-base-multilingual-cased}},
  note         = {Accessed: 2024-07-01},
}
% Aya 23 open-weight multilingual model release (arXiv preprint).
@misc{aryabumi2024aya,
  author        = {Aryabumi, Viraat and
                   Dang, John and
                   Talupuru, Dwarak and
                   Dash, Saurabh and
                   Cairuz, David and
                   Lin, Hangyu and
                   Venkitesh, Bharat and
                   Smith, Madeline and
                   Marchisio, Kelly and
                   Ruder, Sebastian and
                   Locatelli, Acyr and
                   Kreutzer, Julia and
                   Frosst, Nick and
                   Blunsom, Phil and
                   Fadaee, Marzieh and
                   Üstün, Ahmet and
                   Hooker, Sara},
  title         = {Aya 23: Open Weight Releases to Further Multilingual Progress},
  year          = {2024},
  eprint        = {2405.15032},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% Meta-Llama-3-8B-Instruct checkpoint on the Hugging Face Hub.
% The link is typeset with \url (the backslash had been lost); the citing
% document must load the url or hyperref package.
% NOTE(review): year changed from 2023 to 2024 because the Llama 3 models were
% released in 2024 - confirm on the model page. The citation key keeps its
% "2023" suffix so existing citations still resolve.
@misc{meta_llama_2023,
  author       = {{Meta AI}},
  title        = {{Meta-Llama-3-8B-Instruct}},
  year         = {2024},
  howpublished = {\url{https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct}},
  note         = {Accessed: 2024-07-01},
}
% Mistral 7B model report (arXiv preprint).
% The model name is braced so sentence-casing styles do not render it as
% "Mistral 7b". "Le Scao" is given in Last, First form so the compound surname
% is not split, matching how the same author is written elsewhere in this file.
@misc{jiang2023mistral7b,
  author        = {Albert Q. Jiang and Alexandre Sablayrolles and Arthur Mensch and
                   Chris Bamford and Devendra Singh Chaplot and Diego de las Casas and
                   Florian Bressand and Gianna Lengyel and Guillaume Lample and
                   Lucile Saulnier and Lélio Renard Lavaud and Marie-Anne Lachaux and
                   Pierre Stock and Le Scao, Teven and Thibaut Lavril and
                   Thomas Wang and Timothée Lacroix and William El Sayed},
  title         = {{Mistral 7B}},
  year          = {2023},
  eprint        = {2310.06825},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% GPT-4 technical report (arXiv preprint).
% Fixes a stray "and" that split "Michael Pokorny" into two separate authors
% ("Michael" and "Pokorny"), and braces the model name so sentence-casing
% styles do not render it as "Gpt-4".
@misc{openai2024gpt4technicalreport,
  author        = {OpenAI and Josh Achiam and Steven Adler and Sandhini Agarwal and Lama Ahmad and Ilge Akkaya and Florencia Leoni Aleman and Diogo Almeida and Janko Altenschmidt and Sam Altman and
                   Shyamal Anadkat and Red Avila and Igor Babuschkin and Suchir Balaji and Valerie Balcom and Paul Baltescu and Haiming Bao and Mohammad Bavarian and Jeff Belgum and Irwan Bello and
                   Jake Berdine and Gabriel Bernadett-Shapiro and Christopher Berner and Lenny Bogdonoff and Oleg Boiko and Madelaine Boyd and Anna-Luisa Brakman and Greg Brockman and Tim Brooks and Miles Brundage and
                   Kevin Button and Trevor Cai and Rosie Campbell and Andrew Cann and Brittany Carey and Chelsea Carlson and Rory Carmichael and Brooke Chan and Che Chang and Fotis Chantzis and
                   Derek Chen and Sully Chen and Ruby Chen and Jason Chen and Mark Chen and Ben Chess and Chester Cho and Casey Chu and Hyung Won Chung and Dave Cummings and
                   Jeremiah Currier and Yunxing Dai and Cory Decareaux and Thomas Degry and Noah Deutsch and Damien Deville and Arka Dhar and David Dohan and Steve Dowling and Sheila Dunning and
                   Adrien Ecoffet and Atty Eleti and Tyna Eloundou and David Farhi and Liam Fedus and Niko Felix and Simón Posada Fishman and Juston Forte and Isabella Fulford and Leo Gao and
                   Elie Georges and Christian Gibson and Vik Goel and Tarun Gogineni and Gabriel Goh and Rapha Gontijo-Lopes and Jonathan Gordon and Morgan Grafstein and Scott Gray and Ryan Greene and
                   Joshua Gross and Shixiang Shane Gu and Yufei Guo and Chris Hallacy and Jesse Han and Jeff Harris and Yuchen He and Mike Heaton and Johannes Heidecke and Chris Hesse and
                   Alan Hickey and Wade Hickey and Peter Hoeschele and Brandon Houghton and Kenny Hsu and Shengli Hu and Xin Hu and Joost Huizinga and Shantanu Jain and Shawn Jain and
                   Joanne Jang and Angela Jiang and Roger Jiang and Haozhun Jin and Denny Jin and Shino Jomoto and Billie Jonn and Heewoo Jun and Tomer Kaftan and Łukasz Kaiser and
                   Ali Kamali and Ingmar Kanitscheider and Nitish Shirish Keskar and Tabarak Khan and Logan Kilpatrick and Jong Wook Kim and Christina Kim and Yongjik Kim and Jan Hendrik Kirchner and Jamie Kiros and
                   Matt Knight and Daniel Kokotajlo and Łukasz Kondraciuk and Andrew Kondrich and Aris Konstantinidis and Kyle Kosic and Gretchen Krueger and Vishal Kuo and Michael Lampe and Ikai Lan and
                   Teddy Lee and Jan Leike and Jade Leung and Daniel Levy and Chak Ming Li and Rachel Lim and Molly Lin and Stephanie Lin and Mateusz Litwin and Theresa Lopez and
                   Ryan Lowe and Patricia Lue and Anna Makanju and Kim Malfacini and Sam Manning and Todor Markov and Yaniv Markovski and Bianca Martin and Katie Mayer and Andrew Mayne and
                   Bob McGrew and Scott Mayer McKinney and Christine McLeavey and Paul McMillan and Jake McNeil and David Medina and Aalok Mehta and Jacob Menick and Luke Metz and Andrey Mishchenko and
                   Pamela Mishkin and Vinnie Monaco and Evan Morikawa and Daniel Mossing and Tong Mu and Mira Murati and Oleg Murk and David Mély and Ashvin Nair and Reiichiro Nakano and
                   Rajeev Nayak and Arvind Neelakantan and Richard Ngo and Hyeonwoo Noh and Long Ouyang and Cullen O'Keefe and Jakub Pachocki and Alex Paino and Joe Palermo and Ashley Pantuliano and
                   Giambattista Parascandolo and Joel Parish and Emy Parparita and Alex Passos and Mikhail Pavlov and Andrew Peng and Adam Perelman and Filipe de Avila Belbute Peres and Michael Petrov and Henrique Ponde de Oliveira Pinto and
                   Michael Pokorny and Michelle Pokrass and Vitchyr H. Pong and Tolly Powell and Alethea Power and Boris Power and Elizabeth Proehl and Raul Puri and Alec Radford and Jack Rae and
                   Aditya Ramesh and Cameron Raymond and Francis Real and Kendra Rimbach and Carl Ross and Bob Rotsted and Henri Roussez and Nick Ryder and Mario Saltarelli and Ted Sanders and
                   Shibani Santurkar and Girish Sastry and Heather Schmidt and David Schnurr and John Schulman and Daniel Selsam and Kyla Sheppard and Toki Sherbakov and Jessica Shieh and Sarah Shoker and
                   Pranav Shyam and Szymon Sidor and Eric Sigler and Maddie Simens and Jordan Sitkin and Katarina Slama and Ian Sohl and Benjamin Sokolowsky and Yang Song and Natalie Staudacher and
                   Felipe Petroski Such and Natalie Summers and Ilya Sutskever and Jie Tang and Nikolas Tezak and Madeleine B. Thompson and Phil Tillet and Amin Tootoonchian and Elizabeth Tseng and Preston Tuggle and
                   Nick Turley and Jerry Tworek and Juan Felipe Cerón Uribe and Andrea Vallone and Arun Vijayvergiya and Chelsea Voss and Carroll Wainwright and Justin Jay Wang and Alvin Wang and Ben Wang and
                   Jonathan Ward and Jason Wei and CJ Weinmann and Akila Welihinda and Peter Welinder and Jiayi Weng and Lilian Weng and Matt Wiethoff and Dave Willner and Clemens Winter and
                   Samuel Wolrich and Hannah Wong and Lauren Workman and Sherwin Wu and Jeff Wu and Michael Wu and Kai Xiao and Tao Xu and Sarah Yoo and Kevin Yu and
                   Qiming Yuan and Wojciech Zaremba and Rowan Zellers and Chong Zhang and Marvin Zhang and Shengjia Zhao and Tianhao Zheng and Juntang Zhuang and William Zhuk and Barret Zoph},
  title         = {{GPT-4} Technical Report},
  year          = {2024},
  eprint        = {2303.08774},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% Singhal et al., Nature 620 (2023): large language models and clinical knowledge.
@article{singhal2022largelanguagemodelsencode,
  author  = {Singhal, Karan and
             Azizi, Shekoofeh and
             Tu, Tao and
             Mahdavi, S Sara and
             Wei, Jason and
             Chung, Hyung Won and
             Scales, Nathan and
             Tanwani, Ajay and
             Cole-Lewis, Heather and
             Pfohl, Stephen and
             Payne, Perry and
             Seneviratne, Martin and
             Gamble, Paul and
             Kelly, Chris and
             Babiker, Abubakr and
             Schärli, Nathanael and
             Chowdhery, Aakanksha and
             Mansfield, Philip and
             Demner-Fushman, Dina and
             Agüera Y Arcas, Blaise and
             Webster, Dale and
             Corrado, Greg S and
             Matias, Yossi and
             Chou, Katherine and
             Gottweis, Juraj and
             Tomasev, Nenad and
             Liu, Yun and
             Rajkomar, Alvin and
             Barral, Joelle and
             Semturs, Christopher and
             Karthikesalingam, Alan and
             Natarajan, Vivek},
  title   = {Large language models encode clinical knowledge},
  journal = {Nature},
  year    = {2023},
  volume  = {620},
  pages   = {172--180},
  doi     = {10.1038/s41586-023-06291-2},
}
% Survey of generative AI for synthetic data generation (arXiv preprint).
% The acronym is braced so sentence-casing styles do not render it as "ai".
@misc{guo2024generativeaisyntheticdata,
  author        = {Guo, Xu and Chen, Yiqiang},
  title         = {Generative {AI} for Synthetic Data Generation: Methods, Challenges and the Future},
  year          = {2024},
  eprint        = {2403.04190},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
% Li, Wang and Yu: clinical data generation with LLMs (Findings of EMNLP 2023).
@inproceedings{li-etal-2023-two,
  author    = {Li, Rumeng and Wang, Xun and Yu, Hong},
  title     = {Two Directions for Clinical Data Generation with Large Language Models: Data-to-Label and Label-to-Data},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023},
  year      = {2023},
  pages     = {7129--7143},
  doi       = {10.18653/v1/2023.findings-emnlp.474},
}
% Gemma 2 technical report (PDF hosted by Google DeepMind).
% The author is an organisation whose name contains a comma; without the extra
% braces BibTeX reads it as last name "Gemma Team", first name "Google DeepMind".
@misc{gemma_2024,
  author = {{Gemma Team, Google DeepMind}},
  title  = {Gemma 2: Improving Open Language Models at a Practical Size},
  year   = {2024},
  url    = {https://storage.googleapis.com/deepmind-media/gemma/gemma-2-report.pdf},
}
% Phi-3 technical report (arXiv preprint).
@misc{abdin2024phi3technicalreporthighly,
  author        = {Marah Abdin and Sam Ade Jacobs and Ammar Ahmad Awan and Jyoti Aneja and Ahmed Awadallah and Hany Awadalla and Nguyen Bach and Amit Bahree and Arash Bakhtiari and Jianmin Bao and
                   Harkirat Behl and Alon Benhaim and Misha Bilenko and Johan Bjorck and Sébastien Bubeck and Qin Cai and Martin Cai and Caio César Teodoro Mendes and Weizhu Chen and Vishrav Chaudhary and
                   Dong Chen and Dongdong Chen and Yen-Chun Chen and Yi-Ling Chen and Parul Chopra and Xiyang Dai and Allie Del Giorno and Gustavo de Rosa and Matthew Dixon and Ronen Eldan and
                   Victor Fragoso and Dan Iter and Mei Gao and Min Gao and Jianfeng Gao and Amit Garg and Abhishek Goswami and Suriya Gunasekar and Emman Haider and Junheng Hao and
                   Russell J. Hewett and Jamie Huynh and Mojan Javaheripi and Xin Jin and Piero Kauffmann and Nikos Karampatziakis and Dongwoo Kim and Mahoud Khademi and Lev Kurilenko and James R. Lee and
                   Yin Tat Lee and Yuanzhi Li and Yunsheng Li and Chen Liang and Lars Liden and Ce Liu and Mengchen Liu and Weishung Liu and Eric Lin and Zeqi Lin and
                   Chong Luo and Piyush Madan and Matt Mazzola and Arindam Mitra and Hardik Modi and Anh Nguyen and Brandon Norick and Barun Patra and Daniel Perez-Becker and Thomas Portet and
                   Reid Pryzant and Heyang Qin and Marko Radmilac and Corby Rosset and Sambudha Roy and Olatunji Ruwase and Olli Saarikivi and Amin Saied and Adil Salim and Michael Santacroce and
                   Shital Shah and Ning Shang and Hiteshi Sharma and Swadheen Shukla and Xia Song and Masahiro Tanaka and Andrea Tupini and Xin Wang and Lijuan Wang and Chunyu Wang and
                   Yu Wang and Rachel Ward and Guanhua Wang and Philipp Witte and Haiping Wu and Michael Wyatt and Bin Xiao and Can Xu and Jiahang Xu and Weijian Xu and
                   Sonali Yadav and Fan Yang and Jianwei Yang and Ziyi Yang and Yifan Yang and Donghan Yu and Lu Yuan and Chengruidong Zhang and Cyril Zhang and Jianwen Zhang and
                   Li Lyna Zhang and Yi Zhang and Yue Zhang and Yunan Zhang and Xiren Zhou},
  title         = {Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone},
  year          = {2024},
  eprint        = {2404.14219},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% Review of machine learning methods for synthetic data generation (arXiv preprint).
@misc{lu2024machinelearningsyntheticdata,
  author        = {Lu, Yingzhou and
                   Shen, Minjie and
                   Wang, Huazheng and
                   Wang, Xiao and
                   van Rechem, Capucine and
                   Fu, Tianfan and
                   Wei, Wenqi},
  title         = {Machine Learning for Synthetic Data Generation: A Review},
  year          = {2024},
  eprint        = {2302.04062},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
}
% Tang et al.: LLM-generated synthetic data for clinical text mining (arXiv preprint).
% The acronym is braced so sentence-casing styles do not render it as "llms".
@misc{tang2023doessyntheticdatageneration,
  author        = {Tang, Ruixiang and Han, Xiaotian and Jiang, Xiaoqian and Hu, Xia},
  title         = {Does Synthetic Data Generation of {LLMs} Help Clinical Text Mining?},
  year          = {2023},
  eprint        = {2303.04360},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}
% Hamel (2007), AILA Review 20: dominance of English in scientific publishing.
% Page range uses the double hyphen (en dash), the month uses the standard
% unquoted macro, and the proper noun in the title is braced so sentence-casing
% styles do not render it as "english".
@article{Hamel2007-td,
  author  = {Hamel, Rainer},
  title   = {The dominance of {English} in the international scientific periodical literature and the future of language use in science},
  journal = {AILA Review},
  year    = {2007},
  month   = dec,
  volume  = {20},
  pages   = {53--71},
  doi     = {10.1075/aila.20.06ham},
}
% SmolLM release announcement (Hugging Face blog post).
% "Ben Allal" is written in Last, First form so the compound surname is not
% split, and the model name is braced to keep its capitalisation.
% NOTE(review): the url field was added so the source can be located; it is
% presumed to be the Hugging Face blog post - confirm the address.
@misc{allal2024SmolLM,
  author = {Ben Allal, Loubna and
            Lozhkov, Anton and
            Bakouch, Elie and
            von Werra, Leandro and
            Wolf, Thomas},
  title  = {{SmolLM} - blazingly fast and remarkably powerful},
  year   = {2024},
  url    = {https://huggingface.co/blog/smollm},
}
% Dolinar, Calcina and Novak: non-English synthetic medical data sets
% (Slovenian KDD Conference, 2024).
% "Non-English" is braced so sentence-casing styles do not render it as
% "non-english".
@inproceedings{Dolinar2024-th,
  author    = {Dolinar, Lenart and Calcina, Erik and Novak, Erik},
  title     = {Generating {Non-English} Synthetic Medical Data Sets},
  booktitle = {Proceedings of the Slovenian KDD Conference},
  year      = {2024},
}
% Belkadi et al.: synthetic free-text medical records via masked language
% modeling (arXiv preprint).
@misc{belkadi2024generatingsyntheticfreetextmedical,
  author        = {Belkadi, Samuel and
                   Ren, Libo and
                   Micheletti, Nicolo and
                   Han, Lifeng and
                   Nenadic, Goran},
  title         = {Generating Synthetic Free-text Medical Records with Low Re-identification Risk using Masked Language Modeling},
  year          = {2024},
  eprint        = {2409.09831},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2409.09831},
}
DOI: https://doi.org/10.31449/inf.v49i27.7763

This work is licensed under a Creative Commons Attribution 3.0 License.