Align
# Dataset list under the "Align" heading. Each list entry is one mapping
# that pairs a data_path with its sampling_strategy.
datasets:
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/LLaVA-Pretrain/llava-pretrain-refine-500k.json
  sampling_strategy: "all"
Pretrain
v1-0410
# Dataset list under the "Pretrain" / "v1-0410" headings. Each list entry is
# one mapping that pairs a data_path with its sampling_strategy; the comments
# below group the entries by data category.
datasets:
# Caption
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/Caption/pixelprose-14m/pixelprose-processed-2m.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/ReCap/BLIP-ReCap-558K.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/ReCap/COCO-ReCap-118K.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/ReCap/CC3M-ReCap.json
  sampling_strategy: "all"
# Documents
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/Documents/ureader_tr_101k.json
  sampling_strategy: "all"
# OCR
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/OCR/synthdog_en_100k.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/OCR/synthdog_zh_100k.json
  sampling_strategy: "all"
# Text
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/Text/evol_instruct_143k.json
  sampling_strategy: "all"
# OMS
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/OMS/train_json/oms_train_qwen_caption_zh_4827_v2.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/OMS/train_json/oms_train_qwen_caption_en_4827_v2.json
  sampling_strategy: "all"
v2-0512
# Interleaved dataset list under the "V2-0512" heading. Each list entry is
# one mapping that pairs a data_path with its sampling_strategy.
# NOTE(review): another `datasets:` key follows directly after this list. If
# both are meant to live in the same mapping, the key is duplicated and most
# YAML parsers keep only the last one; confirm whether these are separate
# configs or should be merged into a single list.
datasets:
# Interleaved
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/InterLeaving/Omnicorpus/CC-MAIN-2023-14_last40_32k.json
  sampling_strategy: "all"
# Main dataset list under the "V2-0512" heading. Each list entry is one
# mapping that pairs a data_path with its sampling_strategy; the comments
# below group the entries by data category.
# NOTE(review): a `datasets:` key with the interleaved corpus appears just
# before this one. If both are meant to live in the same mapping, the key is
# duplicated and most YAML parsers keep only the last one; confirm whether
# these are separate configs or should be merged into a single list.
datasets:
# Caption
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/Caption/pixelprose/pixelprose-processed-dedup-2m.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/ReCap/BLIP-ReCap-dedup-550K.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/ReCap/COCO-ReCap-dedup-118K.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/ReCap/CC3M-ReCap-dedup.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/Caption/SA-Caption/sa_caption_zh_19k.json
  sampling_strategy: "all"
# OCR
# NOTE(review): the v1-0410 list references ureader_tr_101k.json under
# Documents/, not OCR/ - confirm this path exists.
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/OCR/ureader_tr_101k.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/OCR/synthdog_en_100k.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/OCR/synthdog_zh_100k.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/OCR/synthdog_ko_500k.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/OCR/synthdog_ja_500k.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/OCR/EST-VQA_17k.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/OCR/OCR-VQA_207k.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/OCR/LSVT/LSVT-2019-fulllabel.json
  sampling_strategy: "all"
# Text
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/Text/evol_instruct_143k.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/Text/smollm-corpus/cosmopedia-v2/cosmopedia-v2-train-0-2-1M.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/Text/SCP-116K/SCP_274K.json
  sampling_strategy: "all"
# Grounding
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/Grounding/Ref_L4/Ref_L4_dedup_norm_18k.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/Grounding/Objects365_v1_norm_623k.json
  sampling_strategy: "all"
# OMS
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/OMS/train_json/oms_train_qwen_caption_zh_4827_v2.json
  sampling_strategy: "all"
- data_path: /root/autodl-tmp/data/mllm_datasets/meta_files/OMS/train_json/oms_train_qwen_caption_en_4827_v2.json
  sampling_strategy: "all"
v1数据去重
SFT
V1
模型实验