Adding Benchmarks to agbench #3803

Merged · 17 commits · Oct 18, 2024
4 changes: 4 additions & 0 deletions python/packages/agbench/benchmarks/.gitignore
@@ -0,0 +1,4 @@
*/Results/
*/Tasks/
*/Downloads/
*/ENV.json
@@ -0,0 +1 @@
ENV.json
@@ -0,0 +1,5 @@
{
"BING_API_KEY": "YOUR_KEY_KEY",
"HOMEPAGE": "https://www.bing.com/",
"WEB_SURFER_DEBUG_DIR": "/autogen/debug"
}
78 changes: 78 additions & 0 deletions python/packages/agbench/benchmarks/AssistantBench/README.md
@@ -0,0 +1,78 @@
# AssistantBench Benchmark

This scenario implements the [AssistantBench](https://assistantbench.github.io/) agent benchmark. Before you begin, make sure you have followed the instructions in `../README.md` to prepare your environment. We modified the AssistantBench evaluation code in [Scripts](Scripts) and retain its license, included here: [LICENSE](Scripts/evaluate_utils/LICENSE). The original AssistantBench evaluation code is available at [https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation](https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation).

### Setup Environment Variables for AgBench

Navigate to the AssistantBench directory:

```bash
cd benchmarks/AssistantBench
```

Create a file called ENV.json with the following (required) contents (if you're using MagenticOne):

```json
{
"BING_API_KEY": "REPLACE_WITH_YOUR_BING_API_KEY",
"HOMEPAGE": "https://www.bing.com/",
"WEB_SURFER_DEBUG_DIR": "/autogen/debug",
"CHAT_COMPLETION_KWARGS_JSON": "{\"api_version\": \"2024-02-15-preview\", \"azure_endpoint\": \"YOUR_ENDPOINT/\", \"model_capabilities\": {\"function_calling\": true, \"json_output\": true, \"vision\": true}, \"azure_ad_token_provider\": \"DEFAULT\", \"model\": \"gpt-4o-2024-05-13\"}",
"CHAT_COMPLETION_PROVIDER": "azure"
}
```

You can also use the OpenAI client by replacing the last two entries in the ENV file with:

- `CHAT_COMPLETION_PROVIDER='openai'`
- `CHAT_COMPLETION_KWARGS_JSON` with the following JSON structure:

```json
{
"api_key": "REPLACE_WITH_YOUR_API",
"model": "gpt-4o-2024-05-13"
}
```
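For reference, a complete `ENV.json` for the OpenAI client could then look like the following sketch (all key values are placeholders; note that `CHAT_COMPLETION_KWARGS_JSON` is itself a JSON string, so its inner quotes must be escaped):

```json
{
    "BING_API_KEY": "REPLACE_WITH_YOUR_BING_API_KEY",
    "HOMEPAGE": "https://www.bing.com/",
    "WEB_SURFER_DEBUG_DIR": "/autogen/debug",
    "CHAT_COMPLETION_KWARGS_JSON": "{\"api_key\": \"REPLACE_WITH_YOUR_API\", \"model\": \"gpt-4o-2024-05-13\"}",
    "CHAT_COMPLETION_PROVIDER": "openai"
}
```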

Now initialize the tasks.

```bash
python Scripts/init_tasks.py
```

Note: This will attempt to download AssistantBench from Hugging Face, which requires authentication.
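If you have not authenticated with Hugging Face on this machine before, one way to do so (a sketch assuming the `huggingface_hub` package; any equivalent authentication method works) is:

```bash
pip install huggingface_hub
huggingface-cli login  # paste an access token created at https://huggingface.co/settings/tokens
```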

After running the script, you should see the following new folders and files:

```
.
./Downloads
./Downloads/AssistantBench
./Downloads/AssistantBench/assistant_bench_v1.0_dev.jsonl
./Tasks
./Tasks/assistant_bench_v1.0_dev.jsonl
```

Then run `Scripts/init_tasks.py` again.

Once the script completes, you should now see a folder in your current directory called `Tasks` that contains one JSONL file per template in `Templates`.

### Running AssistantBench

Now, to run a specific subset of AssistantBench, use:

```bash
agbench run Tasks/assistant_bench_v1.0_dev__MagenticOne.jsonl
```

You should see the command line print the raw logs showing the agents in action. To see a summary of the results (e.g., task completion rates), run the following in a new terminal:

```bash
agbench tabulate Results/assistant_bench_v1.0_dev__MagenticOne
```

## References

Yoran, Ori, Samuel Joseph Amouyal, Chaitanya Malaviya, Ben Bogin, Ofir Press, and Jonathan Berant. "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?" arXiv preprint arXiv:2407.15711 (2024). https://arxiv.org/abs/2407.15711
@@ -0,0 +1,127 @@
# This script is slightly modified from the evaluator published by the creators of the
# AssistantBench dataset:
# https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/evaluation/evaluator.py
import json

import numpy as np

from evaluate_utils.evaluate_factory import get_evaluator


def find_isnan(samp):
    """Return True if samp is NaN; False for anything np.isnan cannot handle."""
    try:
        if np.isnan(samp):
            return True
        else:
            return False
    except Exception:
        return False


def fix_ans(answer):
    """Convert single-quoted, dict-like strings into valid JSON so they can be parsed."""
    try:
        answer = answer.replace("{'", '{"').replace("', '", '", "').replace("': '", '": "').replace("'}", '"}')
        answer = answer.replace("': ", '": ')
        return answer
    except Exception:
        return answer


def parse_answer(answer):
    """Classify a gold answer (given as a list of strings) as a number, JSON, string, or string list."""
    if len(answer) == 1:
        ans, is_num = fix_number(answer[0])
        if is_num:
            return ans, "number"
        try:
            ans = json.loads(fix_ans(answer[0]))
            return [ans], "json"
        except Exception:
            ans, is_num = fix_number(answer[0])
            if is_num:
                return ans, "number"
            else:
                return answer[0], "string"
    else:
        try:
            ans = [json.loads(fix_ans(ex)) for ex in answer]
            return ans, "json"
        except Exception:
            return answer, "string list"


def fix_number(number):
    """Try to coerce a value to float, stripping units such as $, %, sqft, and 'square kilometers'."""
    if type(number) == str:
        copy_ans = number
        copy_ans = " ".join(" ".join(" ".join(copy_ans.split("$")).split("%")).split("sqft")).strip()
        copy_ans = copy_ans.strip()
        copy_ans = copy_ans.replace(",", ".").replace(" square kilometers", "")
        try:
            return float(copy_ans), True
        except Exception:
            return number, False
    elif type(number) == int:
        return float(number), True
    else:
        return number, True


def fix_prediction(prediction, gold_answer, evaluator):
    """Normalize the prediction to match the gold answer's type.

    Returns the (possibly rewritten) prediction and a flag indicating whether
    evaluation should proceed.
    """
    if (
        type(prediction) == list
        and len(prediction) == 1
        and (type(prediction[0]) == int or ((type(prediction[0]) == str) and prediction[0].isnumeric()))
    ):
        prediction = fix_number(prediction[0])

    if type(prediction) != list:
        prediction, is_num = fix_number(prediction)
        if evaluator == "json":
            try:
                prediction = [json.loads(pred) for pred in prediction.split("\n")]
            except Exception:
                prediction = [prediction]

    if (hasattr(type(prediction), "__len__")) and (len(prediction) == 0):
        return prediction, False

    if (type(prediction) == list and len(prediction) > 1) and type(gold_answer) == float:
        return prediction, False

    return prediction, True


def question_scorer(prediction, gold_answer):
    """
    prediction: str or list of str
    gold_answer: str or list of str

    returns a float between 0 and 1
    """
    try:
        try:
            prediction = json.loads(prediction)
        except Exception:
            prediction = prediction

        answer_list = (
            [x for x in gold_answer.split("\n") if len(x.strip()) > 0] if type(gold_answer) != list else gold_answer
        )
        gold_answer, evaluator = parse_answer(answer_list)
        prediction, run_eval = fix_prediction(prediction, gold_answer, evaluator)

        has_ans = 1.0
        if (type(prediction) != float and len(prediction) == 0) or find_isnan(prediction):
            has_ans = 0.0

        if not run_eval:
            return 0.0

        metric_eval = get_evaluator(evaluator)
        accuracy = metric_eval(prediction, gold_answer)
        # double-check that the accuracy is a number between 0 and 1
        if 0 <= accuracy <= 1:
            return accuracy
        else:
            raise ValueError(f"Accuracy should be a float between 0 and 1, but got {accuracy}")
    except Exception as e:
        print(
            f"Something went wrong while evaluating prediction {prediction} vs gold answer {gold_answer} with error {e}"
        )
        return 0.0
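For orientation, here is a hedged usage sketch of `question_scorer`. The module name `evaluator` below is an assumption (the diff does not show the file's actual name), and the exact scores depend on the metric implementations in `evaluate_utils`, which are not part of this diff.

```python
# Hypothetical usage sketch: the import path is assumed, and evaluate_utils
# must be importable for get_evaluator() to resolve the metric.
from evaluator import question_scorer

# Numeric answers are normalized by fix_number(), so "1,5 sqft" and "1.5"
# are compared as floats.
print(question_scorer("1,5 sqft", "1.5"))

# A gold answer containing newlines is treated as a list of strings and
# scored with the string-list metric; the result is a float in [0, 1].
print(question_scorer("Paris", "Paris\nLyon"))
```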