#!/bin/bash

# Exit on error, on use of an uninitialized variable, and if any command in a pipeline fails
set -euo pipefail

# Input arguments: image directory path, output predictions directory path, checkpoints directory
# path containing all checkpoints, and the directory containing the original SAM annotation files
IMG_DIR=$1
PRED_DIR=$2
CKPT_DIR=$3
SAM_ANNOTATIONS_DIR=$4

# Adjust the configuration below to match your setup
NUM_GPUs=1
GPU_IDs="0"
MASTER_PORT=1342

# NOTE: The pipeline combines multiple models from different open-source projects, and the
# dependencies vary from one model to another. We therefore had to create roughly ten separate conda
# environments to run the complete pipeline. Please follow the instructions in the corresponding
# model directory to install the dependencies. Pull requests that simplify this process are welcome.
# Thank you.

# The helpers below activate the correct conda environment before running the given command
run_in_env() {
    local env="$1"
    shift
    source "$(conda info --base)/etc/profile.d/conda.sh"
    conda activate "$env"
    "$@"
}

# Same as run_in_env, but also pins the visible GPUs for the torch.distributed launchers
run_in_env_targeted() {
    local env="$1"
    shift
    export CUDA_VISIBLE_DEVICES=$GPU_IDs
    source "$(conda info --base)/etc/profile.d/conda.sh"
    conda activate "$env"
    "$@"
}

# NOTE: We assume ten conda environments have been created, namely grand_env_1 through grand_env_9
# and grand_env_utils. The requirements for these environments are available under the environments
# directory.

# 1. Landmark Categorization
pushd level_1_inference/1_landmark_categorization
run_in_env grand_env_1 python infer.py --image_dir_path "$IMG_DIR" --output_dir_path "$PRED_DIR" --gpu_ids "$GPU_IDs" --llava_model_path "$CKPT_DIR/llava-v1-0719-336px-lora-merge-vicuna-13b-v1.3"
popd

# 2. Depth Maps
pushd level_1_inference/2_depth_maps
run_in_env_targeted grand_env_2 python -m torch.distributed.launch --nproc_per_node="$NUM_GPUs" --master_port="$MASTER_PORT" --use_env infer.py --image_dir_path "$IMG_DIR" --output_dir_path "$PRED_DIR" --model_weights "$CKPT_DIR/dpt_beit_large_512.pt"
popd

# 3. Image Tagging
pushd level_1_inference/3_image_tagging
run_in_env_targeted grand_env_3 python -m torch.distributed.launch --nproc_per_node="$NUM_GPUs" --master_port="$MASTER_PORT" --use_env infer.py --image_dir_path "$IMG_DIR" --output_dir_path "$PRED_DIR" --model-type tag2text --checkpoint "$CKPT_DIR/tag2text_swin_14m.pth"
run_in_env_targeted grand_env_3 python -m torch.distributed.launch --nproc_per_node="$NUM_GPUs" --master_port="$MASTER_PORT" --use_env infer.py --image_dir_path "$IMG_DIR" --output_dir_path "$PRED_DIR" --model-type ram --checkpoint "$CKPT_DIR/ram_swin_large_14m.pth"
popd

# 4. Object Detection using Co-DETR
pushd level_1_inference/4_co_detr
run_in_env grand_env_1 python launch_codetr_multi_gpu_inference.py --image_dir_path "$IMG_DIR" --output_dir_path "$PRED_DIR" --ckpt_path "$CKPT_DIR/co_deformable_detr_swin_large_900q_3x_coco.pth" --gpu_ids "$GPU_IDs"
popd

# 5. Object Detection using EVA-02
pushd level_1_inference/5_eva_02
run_in_env_targeted grand_env_4 python -m torch.distributed.launch --nproc_per_node="$NUM_GPUs" --master_port="$MASTER_PORT" --use_env infer.py --image_dir_path "$IMG_DIR" --output_dir_path "$PRED_DIR" --model_name 'eva-02-01'
run_in_env_targeted grand_env_4 python -m torch.distributed.launch --nproc_per_node="$NUM_GPUs" --master_port="$MASTER_PORT" --use_env infer.py --image_dir_path "$IMG_DIR" --output_dir_path "$PRED_DIR" --model_name 'eva-02-02'
popd
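# Optional sanity check before the open-vocabulary steps (a minimal sketch, not part of the original
# pipeline): steps 6 and 7 read the image tags produced in step 3 from "$PRED_DIR", so it is worth
# confirming that the prediction directory is being populated. The subdirectory names checked here
# ("midas" and "eva-02-01") are taken from paths referenced later in this script; adjust them if your
# model outputs are written under different names.
for sub in midas eva-02-01; do
    if [ ! -d "$PRED_DIR/$sub" ] || [ -z "$(ls -A "$PRED_DIR/$sub")" ]; then
        echo "Warning: expected predictions under $PRED_DIR/$sub were not found." >&2
    fi
done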
# 6. Open Vocabulary Detection using OWL-ViT
pushd level_1_inference/6_owl_vit
run_in_env grand_env_1 python launch_owl_vit_multi_gpu_inference.py --image_dir_path "$IMG_DIR" --output_dir_path "$PRED_DIR" --tags_dir_path "$PRED_DIR" --gpu_ids "$GPU_IDs"
popd

# 7. Open Vocabulary Detection using POMP
pushd level_1_inference/7_pomp
run_in_env grand_env_4 python launch_pomp_multi_gpu_inference.py --image_dir_path "$IMG_DIR" --output_dir_path "$PRED_DIR" --tags_dir_path "$PRED_DIR" --gpu_ids "$GPU_IDs"
popd

# 8. Attribute Detection and Grounding using GRIT
pushd level_1_inference/8_grit
run_in_env_targeted grand_env_3 python -m torch.distributed.launch --nproc_per_node="$NUM_GPUs" --master_port="$MASTER_PORT" --use_env infer.py --image_dir_path "$IMG_DIR" --output_dir_path "$PRED_DIR"
popd

# 9. Open Vocabulary Classification using OV-SAM
pushd level_1_inference/9_ov_sam
run_in_env grand_env_5 python launch_ov_sam_multi_gpu_inference.py --image_dir_path "$IMG_DIR" --output_dir_path "$PRED_DIR" --sam_annotations_dir "$SAM_ANNOTATIONS_DIR" --gpu_ids "$GPU_IDs"
popd

# 10. Generate Level-1 Scene Graph
run_in_env grand_env_utils python utils/merge_json_level_1_with_nms.py --image_dir_path "$IMG_DIR" --predictions_dir_path "$PRED_DIR" --output_dir_path "$PRED_DIR/level-1-raw"
run_in_env grand_env_utils python utils/prepare_level_1.py --image_dir_path "$IMG_DIR" --raw_dir_path "$PRED_DIR/level-1-raw" --output_dir_path "$PRED_DIR/level-1-processed"

# -------------------------------------------------------------------------------------------------------------------- #

# 11. Captioning using BLIP-2
pushd level_2_inference/1_blip-2
run_in_env_targeted grand_env_3 python -m torch.distributed.launch --nproc_per_node="$NUM_GPUs" --master_port="$MASTER_PORT" --use_env infer.py --image_dir_path "$IMG_DIR" --output_dir_path "$PRED_DIR"
popd

# 12. Captioning using LLaVA
pushd level_2_inference/2_llava
run_in_env grand_env_6 python infer.py --image_dir_path "$IMG_DIR" --output_dir_path "$PRED_DIR" --gpu_ids "$GPU_IDs" --llava_model_path "$CKPT_DIR/llava-v1-0719-336px-lora-merge-vicuna-13b-v1.3"
popd

# 13. Grounding using MDETR
pushd level_2_inference/3_mdetr
run_in_env_targeted grand_env_7 python -m torch.distributed.launch --nproc_per_node="$NUM_GPUs" --master_port="$MASTER_PORT" --use_env infer.py --image_dir_path "$IMG_DIR" --output_dir_path "$PRED_DIR" --blip2_pred_path "$PRED_DIR/blip2" --llava_pred_path "$PRED_DIR/llava"
popd

# 14. Generate Level-2 Scene Graph and Update Level-1
run_in_env grand_env_utils python utils/merge_json_level_2.py --predictions_dir_path "$PRED_DIR" --output_dir_path "$PRED_DIR/level-2-raw"
run_in_env grand_env_utils python utils/prepare_level_2.py --raw_dir_path "$PRED_DIR/level-2-raw" --level_2_output_dir_path "$PRED_DIR/level-2-processed" --level_1_dir_path "$PRED_DIR/level-1-processed"

# -------------------------------------------------------------------------------------------------------------------- #

# 15. Enrich Attributes using GPT4RoI
pushd level_2_inference/4_gpt4roi/GPT4RoI
run_in_env grand_env_8 python -m torch.distributed.launch --nproc_per_node="$NUM_GPUs" --master_port="$MASTER_PORT" --use_env gpt4roi/infer.py --image_dir_path "$IMG_DIR" --level_2_pred_path "$PRED_DIR/level-2-processed" --output_dir_path "$PRED_DIR/level-2-processed_gpt4roi"
popd
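# Optional spot check before label assignment (a minimal sketch, not part of the original pipeline):
# it assumes step 15 writes per-image JSON files into "$PRED_DIR/level-2-processed_gpt4roi", the
# directory that step 16 consumes, and only needs a Python interpreter from any of the environments
# above. It validates a few files and warns rather than fails, so the pipeline behaviour is unchanged.
checked=0
for f in "$PRED_DIR/level-2-processed_gpt4roi"/*.json; do
    [ -e "$f" ] || continue
    if ! python -c "import json, sys; json.load(open(sys.argv[1]))" "$f"; then
        echo "Warning: $f is not valid JSON." >&2
    fi
    checked=$((checked + 1))
    if [ "$checked" -ge 3 ]; then
        break
    fi
done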
# 16. Label Assignment using EVA-CLIP
pushd level_2_inference/5_label_assignment
run_in_env_targeted grand_env_4 python -m torch.distributed.launch --nproc_per_node="$NUM_GPUs" --master_port="$MASTER_PORT" --use_env infer.py --image_dir_path "$IMG_DIR" --level_2_dir_path "$PRED_DIR/level-2-processed_gpt4roi" --output_dir_path "$PRED_DIR/level-2-processed_eva_clip"
popd

# 17. Merge EVA-CLIP Assigned Labels & Calculate and Store Depths for All Objects
run_in_env_targeted grand_env_utils python utils/merge_eva_labels.py --level_2_dir_path "$PRED_DIR/level-2-processed_gpt4roi" --labels_path "$PRED_DIR/level-2-processed_eva_clip" --output_dir_path "$PRED_DIR/level-2-processed_labelled" --store_depth --depth_map_dir "$PRED_DIR/midas"

# -------------------------------------------------------------------------------------------------------------------- #

# 18. Generate Level-3 Dense Captions
pushd level_3_dense_caption
run_in_env grand_env_9 python run.py --image_dir_path "$IMG_DIR" --level_2_dir_path "$PRED_DIR/level-2-processed_labelled" --output_dir_path "$PRED_DIR/level-3-vicuna-13B" --gpu_ids "$GPU_IDs" --job_id '111'
popd

# 19. Generate Level-4 Additional Context
pushd level_4_extra_context
run_in_env grand_env_9 python run.py --image_dir_path "$IMG_DIR" --level_2_dir_path "$PRED_DIR/level-2-processed_labelled" --output_dir_path "$PRED_DIR/level-4-vicuna-13B" --gpu_ids "$GPU_IDs" --job_id '111'
popd

# -------------------------------------------------------------------------------------------------------------------- #

# 20. Ground Short & Dense Captions
run_in_env_targeted grand_env_utils python utils/ground_short_captions.py --data_dir_path "$PRED_DIR/level-2-processed_labelled" --output_dir_path "$PRED_DIR/short_captions_grounded"
run_in_env_targeted grand_env_utils python utils/ground_dense_caption.py --level_3_dense_caption_txt_dir_path "$PRED_DIR/level-3-vicuna-13B" --level_2_processed_json_path "$PRED_DIR/short_captions_grounded" --output_dir_path "$PRED_DIR/dense_captions_grounded"

# 21. Add Masks to the Annotations (sources: SAM Annotations & EVA Detector)
run_in_env_targeted grand_env_utils python utils/add_masks_to_annotations.py --input_dir_path "$PRED_DIR/dense_captions_grounded" --sam_json_dir_path "$SAM_ANNOTATIONS_DIR" --eva_02_pred_dir_path "$PRED_DIR/eva-02-01" --output_dir_path "$PRED_DIR/level-3-processed"

# 22. Use HQ-SAM for the Remaining Masks not Found in the SAM Annotations or EVA Detections
pushd utils/hq_sam
run_in_env_targeted grand_env_1 python -m torch.distributed.launch --nproc_per_node="$NUM_GPUs" --master_port="$MASTER_PORT" --use_env run.py --image_dir_path "$IMG_DIR" --level_3_processed_path "$PRED_DIR/level-3-processed" --output_dir_path "$PRED_DIR/level-3-processed_with_masks" --checkpoints_path "$CKPT_DIR/sam_hq_vit_h.pth"
popd

# 23. Add Additional Context to the Annotations
run_in_env_targeted grand_env_utils python utils/add_addional_context.py --annotations_dir_path "$PRED_DIR/level-3-processed_with_masks" --level_4_additional_context_path "$PRED_DIR/level-4-vicuna-13B" --output_dir_path "$PRED_DIR/level-4-processed"

# -------------------------------------------------------------------------------------------------------------------- #

echo "The pipeline inference is complete and the predictions are saved in $PRED_DIR/level-4-processed"
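# Optional closing summary (a minimal sketch, not part of the original pipeline): it only assumes the
# final annotations are written as individual files under "$PRED_DIR/level-4-processed", the
# directory reported above.
if [ -d "$PRED_DIR/level-4-processed" ]; then
    final_count=$(find "$PRED_DIR/level-4-processed" -type f | wc -l)
    echo "Number of annotation files under $PRED_DIR/level-4-processed: $final_count"
fi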