Revert assertions back to using dicts
azrv committed Mar 21, 2024
1 parent adb60a2 commit 450c7e2
Showing 3 changed files with 29 additions and 40 deletions.
33 changes: 15 additions & 18 deletions gpt_engineer/benchmark/benchmarks/apps/load.py
@@ -10,7 +10,6 @@
 load_apps : function
     Loads the APPS benchmark, which consists of a series coding problems.
 """
-from collections import OrderedDict
 from pathlib import Path
 from subprocess import TimeoutExpired
 from typing import Union
@@ -21,6 +20,7 @@
 from gpt_engineer.benchmark.benchmarks.apps.problems import PROBLEM_IDS
 from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
 from gpt_engineer.core.files_dict import FilesDict
+from gpt_engineer.core.prompt import Prompt

 DATASET_PATH = Path("gpt_engineer/benchmark/benchmarks/apps/dataset")
 MAX_N_TEST_EXAMPLES = 10
@@ -82,29 +82,26 @@ def load_apps():
     ]

     for problem in problems:
+        prompt = Prompt(
+            problem.question
+            + "\nThe program, including its inputs, should be run from the command "
+            "line like 'python main \"input1 input2 etc \"', with all inputs inside "
+            "the quotation marks. The program should not read inputs from stdin."
+        )
+
         tasks.append(
             Task(
                 name=str(problem.id),
                 initial_code=FilesDict({"main.py": problem.starter_code}),
                 command=None,  # Explicitly setting `None` because each assertion specifies its command
-                prompt=problem.question
-                + "\nThe program, including its inputs, should be run from the command "
-                "line like 'python main \"input1 input2 etc \"', with all inputs inside "
-                "the quotation marks. The program should not read inputs from stdin.",
-                assertions=[
-                    OrderedDict(
-                        {
-                            "correct output": AppsAssertion(
-                                expected=problem.outputs[i],
-                                command="python main.py"
-                                + ' "'
-                                + problem.inputs[i]
-                                + '"',
-                            ).evaluate
-                        }
-                    )
+                prompt=prompt,
+                assertions={
+                    f"correct output {i}": AppsAssertion(
+                        expected=problem.outputs[i],
+                        command="python main.py" + ' "' + problem.inputs[i] + '"',
+                    ).evaluate
                     for i in range(min(len(problem.outputs), MAX_N_TEST_EXAMPLES))
-                ],
+                },
             )
         )

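For orientation, the net effect of this hunk is that every APPS task now carries a single flat mapping from a per-example assertion name ('correct output 0', 'correct output 1', ...) to the callable that checks that example, instead of a list of one-entry OrderedDicts. The sketch below is a minimal, self-contained illustration of that shape; FakeProblem and the check helper are made-up stand-ins, not the project's Problem or AppsAssertion classes.

from dataclasses import dataclass
from typing import Callable, Dict, List

MAX_N_TEST_EXAMPLES = 10  # mirrors the cap used in load.py


@dataclass
class FakeProblem:
    # Stand-in for an APPS problem record: paired example inputs and outputs.
    inputs: List[str]
    outputs: List[str]


def make_assertions(problem: FakeProblem) -> Dict[str, Callable[[str], bool]]:
    # Build the flat name -> callable mapping, one entry per test example,
    # mirroring the dict comprehension that load_apps() now uses.
    def check(expected: str) -> Callable[[str], bool]:
        return lambda actual: actual.strip() == expected.strip()

    return {
        f"correct output {i}": check(problem.outputs[i])
        for i in range(min(len(problem.outputs), MAX_N_TEST_EXAMPLES))
    }


if __name__ == "__main__":
    problem = FakeProblem(inputs=["1 2", "3 4"], outputs=["3", "7"])
    assertions = make_assertions(problem)
    print(sorted(assertions))                   # ['correct output 0', 'correct output 1']
    print(assertions["correct output 0"]("3"))  # True
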
28 changes: 10 additions & 18 deletions gpt_engineer/benchmark/run.py
@@ -59,7 +59,7 @@ def run(
                 TaskResult(
                     task_name=task.name,
                     duration=time.time() - t0,
-                    assertion_results=[],
+                    assertion_results={},
                 )
             )
             continue
@@ -86,13 +86,10 @@
         task_results.append(
             TaskResult(
                 task_name=task.name,
-                assertion_results=[
-                    {
-                        key: assertion(exec_result)
-                        for key, assertion in task.assertions[i].items()
-                    }
-                    for i in range(len(task.assertions))
-                ],
+                assertion_results={
+                    assertion_name: assertion(exec_result)
+                    for assertion_name, assertion in task.assertions.items()
+                },
                 duration=t1 - t0,
             )
         )
@@ -118,11 +115,9 @@ def print_results(results: list[TaskResult]):
     for task_result in results:
         print(f"\n--- Results for {task_result.task_name} ---")
         print(f"{task_result.task_name} ({task_result.duration:.2f}s)")
-        for assertion_results_dict in task_result.assertion_results:
-            for assertion_name, assertion_result in assertion_results_dict.items():
-                checkmark = "✅" if assertion_result else "❌"
-                print(f" {checkmark} {assertion_name}")
-            print()
+        for assertion_name, assertion_result in task_result.assertion_results.items():
+            checkmark = "✅" if assertion_result else "❌"
+            print(f" {checkmark} {assertion_name}")
+        print()

     success_rates = [task_result.success_rate for task_result in results]
@@ -133,15 +128,12 @@ def print_results(results: list[TaskResult]):
     correct_assertions = sum(
         sum(
             assertion_result
-            for assertion_results_dict in task_result.assertion_results
-            for assertion_result in assertion_results_dict.values()
+            for assertion_result in task_result.assertion_results.values()
         )
         for task_result in results
     )
     total_assertions = sum(
-        len(assertion_results_dict)
-        for task_result in results
-        for assertion_results_dict in task_result.assertion_results
+        len(task_result.assertion_results) for task_result in results
     )
     correct_tasks = [
         task_result for task_result in results if task_result.success_rate == 1
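
The run.py side of the change reads more easily with a toy example: with plain dicts, evaluating and reporting assertions takes one loop level instead of two. The helpers below are an illustrative sketch over hypothetical data, not the project's actual run() or print_results() code.

from typing import Callable, Dict


def evaluate_assertions(
    assertions: Dict[str, Callable[[str], bool]], exec_output: str
) -> Dict[str, bool]:
    # One flat comprehension, as in run(): assertion name -> pass/fail.
    return {name: check(exec_output) for name, check in assertions.items()}


def report(assertion_results: Dict[str, bool]) -> None:
    # A single loop, as in print_results(); no nested dict-of-dicts traversal.
    for name, passed in assertion_results.items():
        print(f" {'✅' if passed else '❌'} {name}")
    print(f"{sum(assertion_results.values())}/{len(assertion_results)} assertions passed")


if __name__ == "__main__":
    assertions = {
        "correct output 0": lambda out: out == "3",
        "correct output 1": lambda out: out == "7",
    }
    report(evaluate_assertions(assertions, "3"))  # one ✅, one ❌, then "1/2 assertions passed"
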
8 changes: 4 additions & 4 deletions gpt_engineer/benchmark/types.py
@@ -22,7 +22,7 @@
 """
 from dataclasses import dataclass
 from subprocess import Popen
-from typing import Callable, List, Optional, OrderedDict
+from typing import Callable, Dict, Optional

 from gpt_engineer.core.base_execution_env import BaseExecutionEnv
 from gpt_engineer.core.files_dict import FilesDict
@@ -58,7 +58,7 @@ class Task:
     initial_code: Optional[FilesDict]
     command: Optional[str]
     prompt: Prompt
-    assertions: Optional[List[OrderedDict[str, Assertion]]]
+    assertions: Optional[Dict[str, Assertion]]


 @dataclass
@@ -73,14 +73,14 @@ class Benchmark:
 @dataclass
 class TaskResult:
     task_name: str
-    assertion_results: List[dict[str, bool]]
+    assertion_results: dict[str, bool]
     duration: float

     # Returns success rate from 0.00 up to 1.00
     @property
     def success_rate(self) -> float:
         succeeded = len(
-            [result for result in self.assertion_results if list(result.values())[0]]
+            [result for result in self.assertion_results.values() if result is True]
         )

         return succeeded / len(self.assertion_results)
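
As a rough before/after illustration of what the type change buys (made-up result data, not benchmark output): the old success_rate had to peel one layer of single-entry OrderedDicts, while a flat dict[str, bool] can be counted directly.

from collections import OrderedDict

# Old shape: a list of single-entry OrderedDicts, one per test example.
old_results = [OrderedDict({"correct output": True}), OrderedDict({"correct output": False})]
old_rate = len([r for r in old_results if list(r.values())[0]]) / len(old_results)

# New shape: one flat mapping from assertion name to outcome.
new_results = {"correct output 0": True, "correct output 1": False}
new_rate = len([v for v in new_results.values() if v is True]) / len(new_results)

assert old_rate == new_rate == 0.5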
