Revert assertions back to using dicts
azrv committed Mar 21, 2024
1 parent adb60a2 commit 450c7e2
Showing 3 changed files with 29 additions and 40 deletions.
33 changes: 15 additions & 18 deletions gpt_engineer/benchmark/benchmarks/apps/load.py
@@ -10,7 +10,6 @@
 load_apps : function
     Loads the APPS benchmark, which consists of a series coding problems.
 """
-from collections import OrderedDict
 from pathlib import Path
 from subprocess import TimeoutExpired
 from typing import Union
@@ -21,6 +20,7 @@
 from gpt_engineer.benchmark.benchmarks.apps.problems import PROBLEM_IDS
 from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
 from gpt_engineer.core.files_dict import FilesDict
+from gpt_engineer.core.prompt import Prompt

 DATASET_PATH = Path("gpt_engineer/benchmark/benchmarks/apps/dataset")
 MAX_N_TEST_EXAMPLES = 10
@@ -82,29 +82,26 @@ def load_apps():
     ]

     for problem in problems:
+        prompt = Prompt(
+            problem.question
+            + "\nThe program, including its inputs, should be run from the command "
+            "line like 'python main \"input1 input2 etc \"', with all inputs inside "
+            "the quotation marks. The program should not read inputs from stdin."
+        )
+
         tasks.append(
             Task(
                 name=str(problem.id),
                 initial_code=FilesDict({"main.py": problem.starter_code}),
                 command=None,  # Explicitly setting `None` because each assertion specifies its command
-                prompt=problem.question
-                + "\nThe program, including its inputs, should be run from the command "
-                "line like 'python main \"input1 input2 etc \"', with all inputs inside "
-                "the quotation marks. The program should not read inputs from stdin.",
-                assertions=[
-                    OrderedDict(
-                        {
-                            "correct output": AppsAssertion(
-                                expected=problem.outputs[i],
-                                command="python main.py"
-                                + ' "'
-                                + problem.inputs[i]
-                                + '"',
-                            ).evaluate
-                        }
-                    )
+                prompt=prompt,
+                assertions={
+                    f"correct output {i}": AppsAssertion(
+                        expected=problem.outputs[i],
+                        command="python main.py" + ' "' + problem.inputs[i] + '"',
+                    ).evaluate
                     for i in range(min(len(problem.outputs), MAX_N_TEST_EXAMPLES))
-                ],
+                },
             )
         )

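For orientation, the net effect of this hunk is that every APPS task now carries a single flat mapping from a per-example assertion name ('correct output 0', 'correct output 1', ...) to the callable that checks that example, instead of a list of one-entry OrderedDicts. The sketch below is a minimal, self-contained illustration of that shape; FakeProblem and the check helper are made-up stand-ins, not the project's Problem or AppsAssertion classes.

from dataclasses import dataclass
from typing import Callable, Dict, List

MAX_N_TEST_EXAMPLES = 10  # mirrors the cap used in load.py


@dataclass
class FakeProblem:
    # Stand-in for an APPS problem record: paired example inputs and outputs.
    inputs: List[str]
    outputs: List[str]


def make_assertions(problem: FakeProblem) -> Dict[str, Callable[[str], bool]]:
    # Build the flat name -> callable mapping, one entry per test example,
    # mirroring the dict comprehension that load_apps() now uses.
    def check(expected: str) -> Callable[[str], bool]:
        return lambda actual: actual.strip() == expected.strip()

    return {
        f"correct output {i}": check(problem.outputs[i])
        for i in range(min(len(problem.outputs), MAX_N_TEST_EXAMPLES))
    }


if __name__ == "__main__":
    problem = FakeProblem(inputs=["1 2", "3 4"], outputs=["3", "7"])
    assertions = make_assertions(problem)
    print(sorted(assertions))                   # ['correct output 0', 'correct output 1']
    print(assertions["correct output 0"]("3"))  # True
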
28 changes: 10 additions & 18 deletions gpt_engineer/benchmark/run.py
@@ -59,7 +59,7 @@ def run(
                 TaskResult(
                     task_name=task.name,
                     duration=time.time() - t0,
-                    assertion_results=[],
+                    assertion_results={},
                 )
             )
             continue
@@ -86,13 +86,10 @@
         task_results.append(
             TaskResult(
                 task_name=task.name,
-                assertion_results=[
-                    {
-                        key: assertion(exec_result)
-                        for key, assertion in task.assertions[i].items()
-                    }
-                    for i in range(len(task.assertions))
-                ],
+                assertion_results={
+                    assertion_name: assertion(exec_result)
+                    for assertion_name, assertion in task.assertions.items()
+                },
                 duration=t1 - t0,
             )
         )
@@ -118,11 +115,9 @@ def print_results(results: list[TaskResult]):
     for task_result in results:
         print(f"\n--- Results for {task_result.task_name} ---")
         print(f"{task_result.task_name} ({task_result.duration:.2f}s)")
-        for assertion_results_dict in task_result.assertion_results:
-            for assertion_name, assertion_result in assertion_results_dict.items():
-                checkmark = "✅" if assertion_result else "❌"
-                print(f" {checkmark} {assertion_name}")
-            print()
+        for assertion_name, assertion_result in task_result.assertion_results.items():
+            checkmark = "✅" if assertion_result else "❌"
+            print(f" {checkmark} {assertion_name}")
+        print()

     success_rates = [task_result.success_rate for task_result in results]
@@ -133,15 +128,12 @@ def print_results(results: list[TaskResult]):
     correct_assertions = sum(
         sum(
             assertion_result
-            for assertion_results_dict in task_result.assertion_results
-            for assertion_result in assertion_results_dict.values()
+            for assertion_result in task_result.assertion_results.values()
         )
         for task_result in results
     )
     total_assertions = sum(
-        len(assertion_results_dict)
-        for task_result in results
-        for assertion_results_dict in task_result.assertion_results
+        len(task_result.assertion_results) for task_result in results
     )
     correct_tasks = [
         task_result for task_result in results if task_result.success_rate == 1
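
The run.py side of the change reads more easily with a toy example: with plain dicts, evaluating and reporting assertions takes one loop level instead of two. The helpers below are an illustrative sketch over hypothetical data, not the project's actual run() or print_results() code.

from typing import Callable, Dict


def evaluate_assertions(
    assertions: Dict[str, Callable[[str], bool]], exec_output: str
) -> Dict[str, bool]:
    # One flat comprehension, as in run(): assertion name -> pass/fail.
    return {name: check(exec_output) for name, check in assertions.items()}


def report(assertion_results: Dict[str, bool]) -> None:
    # A single loop, as in print_results(); no nested dict-of-dicts traversal.
    for name, passed in assertion_results.items():
        print(f" {'✅' if passed else '❌'} {name}")
    print(f"{sum(assertion_results.values())}/{len(assertion_results)} assertions passed")


if __name__ == "__main__":
    assertions = {
        "correct output 0": lambda out: out == "3",
        "correct output 1": lambda out: out == "7",
    }
    report(evaluate_assertions(assertions, "3"))  # one ✅, one ❌, then "1/2 assertions passed"
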
8 changes: 4 additions & 4 deletions gpt_engineer/benchmark/types.py
@@ -22,7 +22,7 @@
 """
 from dataclasses import dataclass
 from subprocess import Popen
-from typing import Callable, List, Optional, OrderedDict
+from typing import Callable, Dict, Optional

 from gpt_engineer.core.base_execution_env import BaseExecutionEnv
 from gpt_engineer.core.files_dict import FilesDict
@@ -58,7 +58,7 @@ class Task:
     initial_code: Optional[FilesDict]
     command: Optional[str]
     prompt: Prompt
-    assertions: Optional[List[OrderedDict[str, Assertion]]]
+    assertions: Optional[Dict[str, Assertion]]


 @dataclass
@@ -73,14 +73,14 @@ class Benchmark:
 @dataclass
 class TaskResult:
     task_name: str
-    assertion_results: List[dict[str, bool]]
+    assertion_results: dict[str, bool]
     duration: float

     # Returns success rate from 0.00 up to 1.00
     @property
     def success_rate(self) -> float:
         succeeded = len(
-            [result for result in self.assertion_results if list(result.values())[0]]
+            [result for result in self.assertion_results.values() if result is True]
         )

         return succeeded / len(self.assertion_results)
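
As a rough before/after illustration of what the type change buys (made-up result data, not benchmark output): the old success_rate had to peel one layer of single-entry OrderedDicts, while a flat dict[str, bool] can be counted directly.

from collections import OrderedDict

# Old shape: a list of single-entry OrderedDicts, one per test example.
old_results = [OrderedDict({"correct output": True}), OrderedDict({"correct output": False})]
old_rate = len([r for r in old_results if list(r.values())[0]]) / len(old_results)

# New shape: one flat mapping from assertion name to outcome.
new_results = {"correct output 0": True, "correct output 1": False}
new_rate = len([v for v in new_results.values() if v is True]) / len(new_results)

assert old_rate == new_rate == 0.5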
