from datasets import Dataset, Features, Value, ClassLabel, Sequence, Image
import json
import PIL.Image as pil_image
from io import BytesIO
from tqdm import tqdm
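
# This script converts each LLaVA-OneVision instruction JSON/JSONL file listed in
# `json_paths` into a Hugging Face dataset and pushes it as a separate config
# (named by the matching entry in `short_names`) to lmms-lab/LLaVA-OneVision-Data.
# Commented-out paths are skipped on this run; some are marked "need to re-upload".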
json_paths = [
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mavis_math_metagen_87358.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mavis_math_rule_geo_100000.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/k12_printing_train_256646.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/iiit5k_annotations_2000.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/hme100k_train_clean_74502.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/ai2d_azuregpt_detailed_understanding_4874.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/infographic_vqa_4404.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/infographic_azuregpt4v_1992.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/lrv_chart_1787.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/lrv_normal_gpt4v_filtered_10500.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/scienceqa_nona_context_19218.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/allava_instruct_vflan4v_20000.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/allava_instruct_laion4v_50000.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/textocr_gpt4v_train_converted_25114.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/ai2d_train_internvl_single_12413.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/textcaps_train_21952.json",
# "/mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_qa_sft.json",
# "/mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_cap_sft.json",
# "/mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_ie_sft.json",
# "/mnt/bn/vl-research/data/llava_instruct/ureader_new/ureader_kg_sft.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/vision_flan_filtered_186070.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/mathqa_29837.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo3k_2101.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_qa_converted_67833.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/geo170k_align_converted_60252.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-coco-50k.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-knowledge-2k.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-llava-30k.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4v-sam-20k.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_CLEVR-Math_5290.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_FigureQA_17597.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_Geometry3K_9734.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_GeoQA+_17172.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_GEOS_508.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_IconQA_22599.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_MapQA_5235.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_PMC-VQA_35958.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_Super-CLEVR_8652.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_TabMWP_22462.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_UniGeo_11959.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/MathV360K_VizWiz_6614.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/magpie_pro_qwen2_72b_st_300000_sp_token_fltd_299992.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/magpie_pro_l3_80b_st_300000.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/magpie_pro_l3_80b_mt_300000_sp_token_fltd_299998.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/image_textualization_dataset_filtered.json",
# "/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/cambrian_filtered_gpt4vo_sp_token_fltd_max10k.json",
"/mnt/bn/vl-research/data/llava_instruct/real_vision_flan/sharegpt4o_dataset.jsonl",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/ai2d_llava_format_2434.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/aokvqa_16539_llava_format.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/chart2text_26961.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/chartqa_18265_llava_format.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/clevr_70000_llava_format.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/diagram_image_to_text_300.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/dvqa_200000_llava_format.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/figureqa_100000_llava_format.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/geomverse_9303.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/hateful_memes_8500_llava_format.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/hitab_2500_llava_format.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/iam_5663.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/raven_42000.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/iconqa_llava_format_27307.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/infographic_vqa_2118_llava_format.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/intergps_1280_llava_format.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/mapqa_37417_llava_format.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/multihiertt_7619.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/rendered_text_10000.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/robut_sqa_8514.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/robut_wikisql_74989.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/robut_wtq_38246_llava_format.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/screen2words_15730.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/scienceqa_llava_format_4976.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/tabmwp_22722.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/tallyqa_98680_llava_format.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/st_vqa_17247_llava_format.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/tqa_llava_format_27307.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/visual7w_llava_format_14366.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/visualmrc_3027.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/vqarad_313_llava_format.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/vsr_2157_llava_format.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/vistext_9969.json",
"/mnt/bn/vl-research/data/llava_instruct/cauldron/websight_10000.json"
]
short_names = [
# "mavis_math_metagen",
# "mavis_math_rule_geo",
# "k12_printing",
# "iiit5k",
# "hme100k",
# "ai2d(gpt4v)",
# "infographic_vqa",
# "infographic(gpt4v)",
# "lrv_chart",
# "lrv_normal(filtered)",
# "scienceqa(nona_context)",
# "allava_instruct_vflan4v",
# "allava_instruct_laion4v",
# "textocr(gpt4v)",
# "ai2d(internvl)",
# "textcaps",
# "ureader_qa", # need to re-upload
# "ureader_cap", # need to re-upload
# "ureader_ie", # need to re-upload
# "ureader_kg", # need to re-upload
# "vision_flan(filtered)",
# "mathqa",
# "geo3k",
# "geo170k(qa)",
# "geo170k(align)",
# "sharegpt4v(coco)",
# "sharegpt4v(knowledge)",
# "sharegpt4v(llava)",
# "sharegpt4v(sam)",
# "CLEVR-Math(MathV360K)",
# "FigureQA(MathV360K)",
# "Geometry3K(MathV360K)",
# "GeoQA+(MathV360K)",
# "GEOS(MathV360K)",
# "IconQA(MathV360K)",
# "MapQA(MathV360K)",
# "PMC-VQA(MathV360K)",
# "Super-CLEVR(MathV360K)",
# "TabMWP(MathV360K)",
# "UniGeo(MathV360K)",
# "VizWiz(MathV360K)",
# "magpie_pro(qwen2_72b_st)",
# "magpie_pro(l3_80b_st)",
# "magpie_pro(l3_80b_mt)",
# "image_textualization(filtered)",
# "cambrian(filtered_gpt4vo)", # need to re-upload
"sharegpt4o",
"ai2d(cauldron,llava_format)",
"aokvqa(cauldron,llava_format)",
"chart2text(cauldron)",
"chartqa(cauldron,llava_format)",
"clevr(cauldron,llava_format)",
"diagram_image_to_text(cauldron)",
"dvqa(cauldron,llava_format)",
"figureqa(cauldron,llava_format)",
"geomverse(cauldron)",
"hateful_memes(cauldron,llava_format)",
"hitab(cauldron,llava_format)",
"iam(cauldron)",
"raven(cauldron)",
"iconqa(cauldron,llava_format)",
"infographic_vqa_llava_format",
"intergps(cauldron,llava_format)",
"mapqa(cauldron,llava_format)",
"multihiertt(cauldron)",
"rendered_text(cauldron)",
"robut_sqa(cauldron)",
"robut_wikisql(cauldron)",
"robut_wtq(cauldron,llava_format)",
"screen2words(cauldron)",
"scienceqa(cauldron,llava_format)",
"tabmwp(cauldron)",
"tallyqa(cauldron,llava_format)",
"st_vqa(cauldron,llava_format)",
"tqa(cauldron,llava_format)",
"visual7w(cauldron,llava_format)",
"visualmrc(cauldron)",
"vqarad(cauldron,llava_format)",
"vsr(cauldron,llava_format)",
"vistext(cauldron)",
"websight(cauldron)"
]
def upload_data(json_path, short_name):
    def gen():
        # Load either a JSON-lines file or a regular JSON array.
        if json_path.endswith(".jsonl"):
            with open(json_path, "r") as f:
                data = [json.loads(line) for line in f]
        else:
            with open(json_path, "r") as f:
                data = json.load(f)

        preview_index = 5
        idx = 0
        for item in tqdm(data):
            # Print the first few items for inspection; note that these
            # previewed items are not yielded into the dataset.
            if preview_index > 0:
                preview_index -= 1
                print(item)
                continue
            try:
                if "image" in item:
                    image_path = f"/mnt/bn/vl-research/data/llava_data/{item['image']}"
                    try:
                        with open(image_path, "rb") as img_file:
                            image = pil_image.open(BytesIO(img_file.read()))
                    except Exception:
                        print(f"Failed to load image {item['image']}")
                        continue
                else:
                    image = None
                item_id = item["id"] if "id" in item else f"{idx:06d}"
                yield {"id": item_id, "image": image, "conversations": item["conversations"], "data_source": short_name}
                idx += 1
            except Exception as e:
                print(e)
                continue

    hf_dataset = Dataset.from_generator(generator=gen, num_proc=32)
    hf_dataset.push_to_hub("lmms-lab/LLaVA-OneVision-Data", config_name=short_name, split="train")
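
# Guard against silent truncation: zip() stops at the shorter list, so a
# mismatch between paths and config names should fail loudly instead.
assert len(json_paths) == len(short_names), "json_paths and short_names must be aligned"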
for json_path, short_name in zip(json_paths, short_names):
    upload_data(json_path, short_name)