Commit f458957

Committed Jul 15, 2024
init
1 parent d607812 commit f458957

31 files changed, +150418 −1 lines
 

.gitignore (+4)

@@ -1,3 +1,7 @@
+# this file
+*.png
+*.jpg
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

README.md (+103 −1)

@@ -1 +1,103 @@
-# ExoViP

<div align="center">

# ExoViP: Step-by-step Verification and Exploration with Exoskeleton Modules for Compositional Visual Reasoning

[![arXiv](https://img.shields.io/badge/arXiv-<INDEX>-b31b1b.svg)](https://arxiv.org/abs/<INDEX>)
[![Conference](http://img.shields.io/badge/COLM-2024-4b44ce.svg)](https://colmweb.org/)

</div>

Official implementation of our paper: ExoViP: Step-by-step Verification and Exploration with Exoskeleton Modules for Compositional Visual Reasoning

![image](assets/framework.png)

## Introduction

In this work, we devise a "plug-and-play" method, ExoViP, that corrects errors at both the planning and execution stages through introspective verification. We employ verification modules as "exoskeletons" to enhance current vision-language programming schemes. Specifically, our proposed verification module uses a mixture of three sub-verifiers to validate predictions after each reasoning step; it then calibrates the visual module predictions and refines the reasoning trace planned by the LLM.
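
As a rough illustration of this verification step, the sketch below re-ranks a module's candidate predictions by averaging three sub-verifier scores. All names and signatures here are hypothetical (not the repository's API), and the real sub-verifiers are vision-language models rather than the toy stubs shown:

```python
# Hypothetical sketch of mixture-of-sub-verifiers re-ranking; the function and
# parameter names are illustrative assumptions, not the repository's API.
from typing import Callable, List

Verifier = Callable[[object, str], float]  # (image, candidate text) -> score in [0, 1]

def verify_step(image, candidates: List[str], sub_verifiers: List[Verifier]) -> str:
    """Score every candidate with each sub-verifier and keep the best one."""
    def mixed_score(cand: str) -> float:
        scores = [v(image, cand) for v in sub_verifiers]
        return sum(scores) / len(scores)  # simple mixture: the mean
    return max(candidates, key=mixed_score)

# toy stand-ins for the image-text matching, captioning, and VQA sub-verifiers
stubs: List[Verifier] = [lambda img, t: float(len(t)) / 100] * 3
print(verify_step(None, ["cat", "a black cat on a sofa"], stubs))
```

In the actual system, these calibrated scores are also what drives the refinement of the reasoning trace planned by the LLM.
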
## Environment

Paste your OPENAI-API-KEY and OPENAI-API-BASE into `engine/.env` and `tasks/*.ipynb`, then create and activate the conda environment:

```
conda env create -f environment.yaml
conda activate exovip
```

If Huggingface is not reachable from your network, you can download all checkpoints into the `prev_trained_models` directory.
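
For reference, here is a minimal sketch of loading those values in Python. It assumes the conventional variable names `OPENAI_API_KEY` and `OPENAI_API_BASE` and the `python-dotenv` package; check `engine/.env` for the names the code actually expects:

```python
# minimal sketch, assuming python-dotenv (pip install python-dotenv) and
# conventional variable names; engine/.env may use different ones
import os
from dotenv import load_dotenv

load_dotenv("engine/.env")
openai_api_key = os.environ["OPENAI_API_KEY"]
openai_api_base = os.environ["OPENAI_API_BASE"]
```
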
## Highlights

Errors in existing methods fall into two categories:

- Module error: the visual modules fail to execute the program correctly
- Planning error: the LLM cannot parse the language query into a correct, solvable program

![image](assets/error.png)

We compared the statistics of a random sample of 100 failure cases before (left) and after (right) applying our method.

![image](assets/stat.png)

## Start

Our method has been validated on six tasks:

- Compositional Image Question Answering: [GQA](https://cs.stanford.edu/people/dorarad/gqa/about.html)
- Referring Expression Understanding: [RefCOCO/RefCOCO+/RefCOCOg](https://github.com/lichengunc/refer)
- Natural Language for Visual Reasoning: [NLVR](https://github.com/lil-lab/nlvr/tree/master/nlvr2)
- Visual Abstract Reasoning: [KILOGRAM](https://github.com/lil-lab/kilogram)
- Language-guided Image Editing: [MagicBrush](https://github.com/OSU-NLP-Group/MagicBrush)
- Spatial-Temporal Video Reasoning: [AGQA](http://ai.stanford.edu/blog/agqa/)

***NOTE**: All experiments are run on subsets of these datasets; please refer to `datasets`.*

Code demos:

```bash
cd tasks

# GQA
gqa.ipynb

# NLVR
nlvr.ipynb

# RefCOCO(+/g)
refcoco.ipynb

# KILOGRAM
kilogram.ipynb

# MagicBrush
magicbrush.ipynb

# AGQA
agqa.ipynb
```
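
To run a demo notebook non-interactively, one option is executing it with `nbconvert` (a sketch; assumes Jupyter's `nbformat` and `nbconvert` are installed in the `exovip` environment, and that you are inside `tasks/`):

```python
# a sketch for executing a demo notebook headlessly from within tasks/;
# assumes nbformat and nbconvert are available (they ship with Jupyter)
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor

nb = nbformat.read("gqa.ipynb", as_version=4)
ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
ep.preprocess(nb, {"metadata": {"path": "."}})  # run all cells in place
nbformat.write(nb, "gqa.out.ipynb")             # save the executed copy
```
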
## Available Modules

![image](assets/modules.png)

## Examples

![image](assets/GQA.png)

## Acknowledgement

[visprog](https://github.com/allenai/visprog), a neuro-symbolic system that solves complex and compositional visual tasks given natural language instructions

## Citation

If you find our work helpful, please cite it.

```bibtex
@article{wang2024exovip,
  title={ExoViP: Step-by-step Verification and Exploration with Exoskeleton Modules for Compositional Visual Reasoning},
  author={Yuxuan Wang and Alan Yuille and Zhuowan Li and Zilong Zheng},
  journal={COLM 2024},
  year={2024}
}
```

baselines/refcoco_baseline.ipynb (+188)

@@ -0,0 +1,188 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import sys\n",
    "import json\n",
    "from pathlib import Path\n",
    "module_path = os.path.abspath(os.path.join('..'))\n",
    "if module_path not in sys.path:\n",
    "    sys.path.append(module_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from PIL import Image\n",
    "from IPython.core.display import HTML\n",
    "from functools import partial\n",
    "\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "from transformers.generation import GenerationConfig\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Note: The default behavior now has injection attack prevention off.\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen-VL-Chat\", trust_remote_code=True)\n",
    "\n",
    "# use bf16\n",
    "# model = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen-VL-Chat\", device_map=\"auto\", trust_remote_code=True, bf16=True).eval()\n",
    "# use fp16\n",
    "# model = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen-VL-Chat\", device_map=\"auto\", trust_remote_code=True, fp16=True).eval()\n",
    "# use cpu only\n",
    "# model = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen-VL-Chat\", device_map=\"cpu\", trust_remote_code=True).eval()\n",
    "# use cuda device\n",
    "model = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen-VL-Chat\", device_map=\"cuda\", trust_remote_code=True).eval()\n",
    "\n",
    "# Specify hyperparameters for generation\n",
    "model.generation_config = GenerationConfig.from_pretrained(\"Qwen/Qwen-VL-Chat\", trust_remote_code=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "from PIL import ImageDraw\n",
    "test_file = os.path.join(Path.home(), 'codes/ExoViP/datasets/refcoco/test.json')\n",
    "with open(test_file) as jp:\n",
    "    test = json.load(jp)\n",
    "eval_pred = 0\n",
    "eval_cnt = 0\n",
    "\n",
    "for idx, dct in tqdm(test.items()):\n",
    "    # eval_cnt += 1\n",
    "    # if eval_cnt < 5: continue\n",
    "\n",
    "    img_id = dct['img']\n",
    "    img_path = os.path.join(Path.home(), 'codes/ExoViP/datasets/refcoco/imgs', img_id)\n",
    "    image = Image.open(img_path)\n",
    "    h, w = image.height, image.width\n",
    "\n",
    "    instruction = dct['instruction']\n",
    "    # print(instruction)\n",
    "\n",
    "    # build a multimodal query for Qwen-VL-Chat\n",
    "    query = tokenizer.from_list_format([\n",
    "        {\"image\": img_path,\n",
    "         \"text\": instruction}\n",
    "    ])\n",
    "\n",
    "    response, history = model.chat(tokenizer, query=query, history=None)\n",
    "    # image = tokenizer.draw_bbox_on_latest_picture(response, history)\n",
    "    # image.save(str(eval_cnt)+'.jpg')\n",
    "    # display(image)\n",
    "\n",
    "    # parse the predicted box '(x1,y1),(x2,y2)' from the response\n",
    "    PATTERN = re.compile(r'\\((.*?)\\),\\((.*?)\\)')\n",
    "    predict_bbox = re.findall(PATTERN, response)\n",
    "    try:\n",
    "        if ',' not in predict_bbox[0][0] or ',' not in predict_bbox[0][1]:\n",
    "            predict_bbox = (0., 0., 0., 0.)\n",
    "        else:\n",
    "            x1, y1 = [float(tmp) for tmp in predict_bbox[0][0].split(',')]\n",
    "            x2, y2 = [float(tmp) for tmp in predict_bbox[0][1].split(',')]\n",
    "            # Qwen-VL coordinates are normalized to 0-1000; rescale to pixels\n",
    "            x1, y1, x2, y2 = (int(x1 / 1000 * w), int(y1 / 1000 * h), int(x2 / 1000 * w), int(y2 / 1000 * h))\n",
    "            predict_bbox = (x1, y1, x2, y2)\n",
    "    except Exception:\n",
    "        predict_bbox = (0., 0., 0., 0.)\n",
    "    box = predict_bbox\n",
    "    label = dct['box']\n",
    "    # print(box)\n",
    "    # print(label)\n",
    "    # print()\n",
    "    # draw = ImageDraw.Draw(image)\n",
    "    # draw.rectangle(box,outline='red',width=4)\n",
    "    # draw.rectangle(label,outline='green',width=4)\n",
    "    # image.save(str(eval_cnt)+'.jpg')\n",
    "\n",
    "    # calculate IoU between the predicted box and the ground-truth label\n",
    "    label_area = (label[2]-label[0]) * (label[3]-label[1])\n",
    "    box_area = (box[2]-box[0]) * (box[3]-box[1])\n",
    "    x1 = max(box[0], label[0])\n",
    "    x2 = min(box[2], label[2])\n",
    "    y1 = max(box[1], label[1])\n",
    "    y2 = min(box[3], label[3])\n",
    "    intersection = max(0, x2-x1) * max(0, y2-y1)\n",
    "    iou = intersection / (label_area + box_area - intersection)\n",
    "    # print(iou)\n",
    "    eval_pred += iou\n",
    "    eval_cnt += 1\n",
    "\n",
    "    # # visualize\n",
    "    # # W,H=image.size\n",
    "    # draw = ImageDraw.Draw(result)\n",
    "    # draw.rectangle(label,outline='red',width=4)\n",
    "    # result.save(f'{idx}.jpg')\n",
    "    # print(idx, instruction)\n",
    "    # if eval_cnt > 5:\n",
    "    #     break\n",
    "\n",
    "    if eval_cnt % 20 == 0:\n",
    "        print(f'step {eval_cnt} iou: ', round(eval_pred/eval_cnt, 2))\n",
    "    # break\n",
    "\n",
    "# mean IoU over the full test set\n",
    "print('iou: ', eval_pred/len(test.keys()))\n",
    "result_file = os.path.join(Path.home(), 'codes/visprog/results/refcoco/qwen.json')\n",
    "with open(result_file, 'w') as jp:\n",
    "    json.dump(test, jp)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.4 ('few-shot-vr')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "f6aae81381dc24e2fd0d8778e266667bb8dbd7e1c04425e21584f774a2d20c40"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
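
For quick reference, the notebook's inline IoU computation is equivalent to this standalone helper (a reference sketch only, not part of the repository):

```python
# reference sketch of the notebook's IoU computation;
# boxes are (x1, y1, x2, y2) in pixel coordinates
def iou(box_a, box_b):
    inter_w = max(0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]))
    inter_h = max(0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]))
    intersection = inter_w * inter_h
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return intersection / (area_a + area_b - intersection)

assert iou((0, 0, 10, 10), (0, 0, 10, 10)) == 1.0
assert iou((0, 0, 10, 10), (5, 5, 15, 15)) == 25 / 175
```
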
