Integrate APPS benchmarking #1051

Merged
24 commits merged on Mar 23, 2024
Changes from 18 commits

Commits (24)
29f4680  Initial APPS benchmark integration (azrv, Feb 19, 2024)
4ddfb5e  Download and cache dataset (azrv, Feb 21, 2024)
cf3db0e  WIP APPS benchmarking (azrv, Feb 28, 2024)
ccb00ee  Merge remote-tracking branch 'origin/main' into apps_self_healing (azrv, Mar 5, 2024)
3c6c4aa  Multiple inputs, results (azrv, Mar 6, 2024)
833e3c9  Add subprocess execution timeout (azrv, Mar 7, 2024)
5e54e5b  Remove unused code (azrv, Mar 10, 2024)
e804c0b  Drop retry logic and outer loop in run.py (azrv, Mar 10, 2024)
95c88f3  Get rid of inputs (azrv, Mar 10, 2024)
5f6f7d6  Get rid of inputs (azrv, Mar 10, 2024)
c25c1d1  Merge branch 'apps2' into apps_self_healing (azrv, Mar 10, 2024)
b6ecace  Remove `AppsAssertion` class (azrv, Mar 11, 2024)
a526a91  Merge remote-tracking branch 'origin/main' into apps_self_healing (azrv, Mar 11, 2024)
37df506  Merge branch 'main' into apps_self_healing (azrv, Mar 16, 2024)
cb0d9ca  Add `AppsAssertion` class, simplify run.py (azrv, Mar 16, 2024)
50cd771  Return run.py to its initial state (azrv, Mar 16, 2024)
20890d3  Run pre-commit; Update lock file (azrv, Mar 16, 2024)
befa702  Run pre-commit on all files (azrv, Mar 16, 2024)
adb60a2  Merge remote-tracking branch 'origin/main' into apps_self_healing (azrv, Mar 21, 2024)
450c7e2  Revert assertions back to using dicts (azrv, Mar 21, 2024)
b175938  Create new execution environment for every run to avoid side effects (azrv, Mar 21, 2024)
911e092  Clean up (azrv, Mar 21, 2024)
d380f05  Revert DiffError; Temporary except `RecursionError` in benchmarks (azrv, Mar 21, 2024)
ce35057  Remove RecursionError try-catch (azrv, Mar 22, 2024)
3 changes: 3 additions & 0 deletions .gitignore
@@ -87,3 +87,6 @@ webapp/.next/

#ignore tox files
.tox

# locally saved datasets
gpt_engineer/benchmark/benchmarks/apps/dataset
114 changes: 114 additions & 0 deletions gpt_engineer/benchmark/benchmarks/apps/load.py
@@ -0,0 +1,114 @@
"""
Module for loading APPS evaluation tasks.

This module provides functionality to load tasks for evaluating GPT-based models
on problems from the APPS dataset. It defines a set of tasks with predefined prompts
and assertions to benchmark the performance of AI models.

Functions
---------
load_apps : function
Loads the APPS benchmark, which consists of a series of coding problems.
"""
from collections import OrderedDict
from pathlib import Path
from subprocess import TimeoutExpired
from typing import Union

from datasets import Dataset, DatasetDict, load_dataset, load_from_disk

from gpt_engineer.benchmark.benchmarks.apps.problem import Problem
from gpt_engineer.benchmark.benchmarks.apps.problems import PROBLEM_IDS
from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
from gpt_engineer.core.files_dict import FilesDict

DATASET_PATH = Path("gpt_engineer/benchmark/benchmarks/apps/dataset")
MAX_N_TEST_EXAMPLES = 10


class AppsAssertion:
def __init__(self, expected: str, command: str):
self.expected_output = self._format(expected)
self.command = command

def evaluate(self, assertable: Assertable) -> bool:
pro = assertable.env.popen(self.command)
Collaborator:
This may sound a little counterintuitive to the architecture, but I think it may be preferable to create a fresh DiskExecutionEnv, upload the code, and run from there instead of using the global one. The reason is that executing the code may have side effects, like creating and removing files and folders. If that is the case, multiple executions of the same program with different inputs will not be independent.

Contributor (author):
I see, will do.

try:
stdout, stderr = pro.communicate(timeout=2)
stdout, stderr = stdout.decode("utf-8"), stderr.decode("utf-8")
except TimeoutExpired:
print("Execution Timeout")
return False

return self.expected_output in self._format(stdout)

def _format(self, string: str) -> str:
return string.replace(" ", "").replace("\n", "")


def _get_dataset() -> Union[Dataset, DatasetDict]:
try:
return load_from_disk(str(DATASET_PATH))
except FileNotFoundError:
print("Dataset not found locally, downloading...")

dataset = load_dataset("codeparrot/apps")
dataset.save_to_disk(DATASET_PATH)

return dataset


def load_apps():
"""
Loads the APPS benchmark, which consists of a series of coding problems.

Returns
-------
Benchmark
A Benchmark object containing a list of Task objects for the APPS evaluation.
"""
dataset = _get_dataset()
tasks = []

problems = [
Problem(
id=problem["problem_id"],
question=problem["question"],
input_output=problem["input_output"],
starter_code=problem["starter_code"],
)
for problem in dataset["test"]
if problem["problem_id"] in PROBLEM_IDS
]

for problem in problems:
tasks.append(
Task(
name=str(problem.id),
initial_code=FilesDict({"main.py": problem.starter_code}),
command=None, # Explicitly setting `None` because each assertion specifies its command
prompt=problem.question
+ "\nThe program, including its inputs, should be run from the command "
"line like 'python main \"input1 input2 etc \"', with all inputs inside "
"the quotation marks. The program should not read inputs from stdin.",
assertions=[
Collaborator:
We can remove the list wrapping again and just use a dictionary comprehension, like in the original design.

OrderedDict(
{
"correct output": AppsAssertion(
expected=problem.outputs[i],
command="python main.py"
+ ' "'
+ problem.inputs[i]
+ '"',
).evaluate
}
)
for i in range(min(len(problem.outputs), MAX_N_TEST_EXAMPLES))
],
)
)

return Benchmark(
name="APPS",
tasks=tasks,
)
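A minimal sketch of the collaborator's suggestion above, creating a fresh execution environment inside evaluate so runs stay independent. This is illustrative only and not part of the diff; it assumes DiskExecutionEnv exposes upload() and popen() the same way run.py uses them.

from subprocess import TimeoutExpired

from gpt_engineer.benchmark.types import Assertable
from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv


class AppsAssertion:
    def __init__(self, expected: str, command: str):
        self.expected_output = self._format(expected)
        self.command = command

    def evaluate(self, assertable: Assertable) -> bool:
        # Fresh environment per evaluation, so file-system side effects of one
        # run cannot leak into the next run of the same program.
        env = DiskExecutionEnv()
        env.upload(assertable.files)
        pro = env.popen(self.command)
        try:
            stdout, _ = pro.communicate(timeout=2)
        except TimeoutExpired:
            print("Execution Timeout")
            return False
        return self.expected_output in self._format(stdout.decode("utf-8"))

    def _format(self, string: str) -> str:
        return string.replace(" ", "").replace("\n", "")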
25 changes: 25 additions & 0 deletions gpt_engineer/benchmark/benchmarks/apps/problem.py
@@ -0,0 +1,25 @@
import json

from dataclasses import dataclass
from functools import cached_property
from typing import List


@dataclass(frozen=True)
class Problem:
id: int
question: str
input_output: str
starter_code: str

@property
def inputs(self) -> List[str]:
return self._parsed_inputs_outputs["inputs"]

@property
def outputs(self) -> List[str]:
return self._parsed_inputs_outputs["outputs"]

@cached_property
def _parsed_inputs_outputs(self):
return json.loads(self.input_output.replace("\n", ""))
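For orientation, a small illustrative example of how Problem parses the APPS input_output field; the values below are made up, not taken from the dataset.

from gpt_engineer.benchmark.benchmarks.apps.problem import Problem

problem = Problem(
    id=0,
    question="Read two integers from the command line and print their sum.",
    input_output='{"inputs": ["1 2"], "outputs": ["3"]}',
    starter_code="",
)
assert problem.inputs == ["1 2"]  # parsed lazily via the cached property
assert problem.outputs == ["3"]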
9 changes: 9 additions & 0 deletions gpt_engineer/benchmark/benchmarks/apps/problems.py
@@ -0,0 +1,9 @@
INTRODUCTORY_PROBLEM_IDS = list(range(2500, 2600))

INTERVIEW_PROBLEM_IDS = list(range(0, 100))

COMPETITION_PROBLEM_IDS = list()

# TODO: Pick problems
# Temporary testing against these problems
PROBLEM_IDS = list(range(0, 50))
3 changes: 2 additions & 1 deletion gpt_engineer/benchmark/benchmarks/load.py
@@ -9,14 +9,15 @@
get_benchmark : function
Retrieves a Benchmark object by name. Raises ValueError if the benchmark is unknown.
"""

from gpt_engineer.benchmark.benchmarks.apps.load import load_apps
from gpt_engineer.benchmark.benchmarks.gpteng.load import load_gpteng
from gpt_engineer.benchmark.benchmarks.gptme.load import load_gptme
from gpt_engineer.benchmark.types import Benchmark

BENCHMARKS = {
"gptme": load_gptme,
"gpteng": load_gpteng,
"apps": load_apps,
}


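A hedged usage sketch of the registry: assuming get_benchmark (referenced in the module docstring, collapsed in this diff) simply looks the name up in BENCHMARKS and calls the loader, the new entry makes the APPS suite reachable by name.

from gpt_engineer.benchmark.benchmarks.load import get_benchmark

benchmark = get_benchmark("apps")  # loads or downloads the dataset and builds the tasks
print(benchmark.name, len(benchmark.tasks))
# get_benchmark("unknown") would raise ValueError, per the module docstring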
65 changes: 45 additions & 20 deletions gpt_engineer/benchmark/run.py
@@ -18,6 +18,7 @@

from gpt_engineer.benchmark.types import Assertable, Benchmark, TaskResult
from gpt_engineer.core.base_agent import BaseAgent
from gpt_engineer.core.chat_to_files import DiffError
from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv


@@ -48,8 +49,20 @@ def run(
"""
task_results = []
for task in benchmark.tasks:
print(f"--> Running task: {task.name}\n")

t0 = time.time()
files_dict = agent.improve(task.initial_code, task.prompt, task.command)
try:
files_dict = agent.improve(task.initial_code, task.prompt)
except DiffError: # Temporary catch errors related to git diffs
Collaborator:
Is this still a problem? I see that this was necessary during development, but I think we should aim to fix this on the diff side rather than having a specific catch in the benchmarks.

task_results.append(
TaskResult(
task_name=task.name,
duration=time.time() - t0,
assertion_results=[],
)
)
continue
t1 = time.time()

env = DiskExecutionEnv()
@@ -61,6 +74,7 @@
stdout, stderr = stdout.decode("utf-8"), stderr.decode("utf-8")
else:
p, stdout, stderr = None, None, None

exec_result = Assertable(
files=files_dict,
env=env,
@@ -72,13 +86,17 @@
task_results.append(
TaskResult(
task_name=task.name,
assertion_results={
assertion_name: assertion(exec_result)
for assertion_name, assertion in task.assertions.items()
},
assertion_results=[
Collaborator:
Same here, we should be able to revert now that we are using the AppsAssertion design. Each APPS assertion will be an entry in the dictionary.

{
key: assertion(exec_result)
for key, assertion in task.assertions[i].items()
}
for i in range(len(task.assertions))
],
duration=t1 - t0,
)
)

if verbose:
print_results(task_results)
return task_results
@@ -100,32 +118,39 @@ def print_results(results: list[TaskResult]):
for task_result in results:
print(f"\n--- Results for {task_result.task_name} ---")
print(f"{task_result.task_name} ({task_result.duration:.2f}s)")
for assertion_name, assertion_result in task_result.assertion_results.items():
checkmark = "✅" if assertion_result else "❌"
print(f" {checkmark} {assertion_name}")
for assertion_results_dict in task_result.assertion_results:
for assertion_name, assertion_result in assertion_results_dict.items():
checkmark = "✅" if assertion_result else "❌"
print(f" {checkmark} {assertion_name}")
print()
print()

success_rates = [task_result.success_rate for task_result in results]
avg_success_rate = sum(success_rates) / len(results)

total_time = sum(task_result.duration for task_result in results)
print(f"Total time: {total_time:.2f}s")

correct_assertions = sum(
sum(
assertion_result
for assertion_result in task_result.assertion_results.values()
for assertion_results_dict in task_result.assertion_results
for assertion_result in assertion_results_dict.values()
)
for task_result in results
)
total_assertions = sum(
len(task_result.assertion_results) for task_result in results
)
print(f"Total correct assertions: {correct_assertions}/{total_assertions}")

correct_tasks = sum(
all(
assertion_result
for assertion_result in task_result.assertion_results.values()
)
len(assertion_results_dict)
for task_result in results
for assertion_results_dict in task_result.assertion_results
)
print(f"Correct tasks: {correct_tasks}/{len(results)}")
correct_tasks = [
task_result for task_result in results if task_result.success_rate == 1
]

print("--- Results ---")
print(f"Total time: {total_time:.2f}s")
print(f"Completely correct tasks: {len(correct_tasks)}/{len(results)}")
print(f"Total correct assertions: {correct_assertions}/{total_assertions}")
print(f"Average success rate: {avg_success_rate * 100}% on {len(results)} tasks")
print("--- Results ---")
print()
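A rough sketch of the revert the reviewers ask for here and in types.py; this is an assumption about the intended design, not code from this diff. task.assertions would go back to a single OrderedDict and assertion_results to a flat dict, with one entry per APPS test case; the per-index key names below are made up to keep the entries unique.

from collections import OrderedDict

# In apps/load.py: build one OrderedDict per task instead of a list of
# single-entry dicts (AppsAssertion, problem, MAX_N_TEST_EXAMPLES as in the diff above):
assertions = OrderedDict(
    {
        f"correct output {i}": AppsAssertion(
            expected=problem.outputs[i],
            command='python main.py "' + problem.inputs[i] + '"',
        ).evaluate
        for i in range(min(len(problem.outputs), MAX_N_TEST_EXAMPLES))
    }
)

# In run.py: the aggregation then stays flat, as in the original design
# (task, exec_result, t0, t1, task_results as in the diff above):
task_results.append(
    TaskResult(
        task_name=task.name,
        assertion_results={
            assertion_name: assertion(exec_result)
            for assertion_name, assertion in task.assertions.items()
        },
        duration=t1 - t0,
    )
)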
15 changes: 12 additions & 3 deletions gpt_engineer/benchmark/types.py
@@ -22,7 +22,7 @@
"""
from dataclasses import dataclass
from subprocess import Popen
from typing import Callable, Dict, Optional
from typing import Callable, List, Optional, OrderedDict

from gpt_engineer.core.base_execution_env import BaseExecutionEnv
from gpt_engineer.core.files_dict import FilesDict
@@ -57,7 +57,7 @@ class Task:
initial_code: Optional[FilesDict]
command: Optional[str]
prompt: str
assertions: Optional[Dict[str, Assertion]]
assertions: Optional[List[OrderedDict[str, Assertion]]]
Collaborator:
Again, revert this.



@dataclass
@@ -72,5 +72,14 @@ class Benchmark:
@dataclass
class TaskResult:
task_name: str
assertion_results: dict[str, bool]
assertion_results: List[dict[str, bool]]
Collaborator:
Again, revert.

duration: float

# Returns success rate from 0.00 up to 1.00
@property
def success_rate(self) -> float:
succeeded = len(
[result for result in self.assertion_results if list(result.values())[0]]
)

return succeeded / len(self.assertion_results)
Collaborator:
I get "ZeroDivisionError: division by zero" here when I run the first 5 problems.

Collaborator:
Has this been addressed? I think it happened because I only ran a few problems, the first 5. Or might it have been the catch clause leaving the list empty?

Contributor (author):
I don't manage to reproduce it, but I added a check at the beginning of the method:

if not self.assertion_results:
    return 0.0

Can you still catch it?
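A worked example of the success_rate property as defined above, using made-up values; each assertion_results entry is the single-key dict produced by one AppsAssertion.

from gpt_engineer.benchmark.types import TaskResult

result = TaskResult(
    task_name="0",
    assertion_results=[{"correct output": True}, {"correct output": False}],
    duration=1.2,
)
assert result.success_rate == 0.5  # one of the two test cases passed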

24 changes: 17 additions & 7 deletions gpt_engineer/core/chat_to_files.py
@@ -26,13 +26,19 @@

from typing import Dict, Tuple

import regex

from gpt_engineer.core.diff import ADD, REMOVE, RETAIN, Diff, Hunk
from gpt_engineer.core.files_dict import FilesDict, file_to_lines_dict

# Initialize a logger for this module
logger = logging.getLogger(__name__)


class DiffError(ValueError):
pass


def chat_to_files_dict(chat: str) -> FilesDict:
"""
Converts a chat string containing file paths and code blocks into a FilesDict object.
@@ -134,21 +140,25 @@
- dict: A dictionary of Diff objects keyed by filename.
"""
# Regex to match individual diff blocks
diff_block_pattern = re.compile(
diff_block_pattern = regex.compile(
Collaborator:
It is great that you are improving the diff pipeline! Formally, this should be a separate PR. Would you be able to make a separate PR with the changes to chat_to_files?

Collaborator:
I can merge that quickly.

Contributor (author):
Will do.
r"```.*?\n\s*?--- .*?\n\s*?\+\+\+ .*?\n(?:@@ .*? @@\n(?:[-+ ].*?\n)*?)*?```",
re.DOTALL,
)

diffs = {}
for block in diff_block_pattern.finditer(diff_string):
diff_block = block.group()
try:
for block in diff_block_pattern.finditer(diff_string, timeout=1):
diff_block = block.group()

# Parse individual diff blocks and update the diffs dictionary
diffs.update(parse_diff_block(diff_block))
# Parse individual diff blocks and update the diffs dictionary
diffs.update(parse_diff_block(diff_block))
except TimeoutError:
raise DiffError("`diff_block_pattern.finditer` has timed out")

Codecov (codecov/patch) warning: added lines 155-156 in gpt_engineer/core/chat_to_files.py were not covered by tests.

if not diffs:
print(
"GPT did not provide any proposed changes. Please try to reselect the files for uploading and edit your prompt file."
raise DiffError(
"GPT did not provide any proposed changes. "
"Please try to reselect the files for uploading and edit your prompt file."
)

Codecov (codecov/patch) warning: added line 159 in gpt_engineer/core/chat_to_files.py was not covered by tests.

return diffs