Merge pull request gpt-engineer-org#1005 from gpt-engineer-org/diff-syntax-for-improve-command

It feels like a great celebration to merge this! Not only is the direct merge apply more reliable when testing things out, but the "chunk self-heal" also works very constructively with me. Thanks a lot for the great work @similato87 !!!
acemi159 · Feb 18, 2024 · a40165e · a40165e
2 parents 43cc0ea + 82cf0e8
commit a40165e
Show file tree

Hide file tree

Showing 24 changed files with 2,388 additions and 527 deletions.
diff --git a/gpt_engineer/applications/cli/file_selector.py b/gpt_engineer/applications/cli/file_selector.py
@@ -352,6 +352,8 @@ def get_current_files(self, project_path: Union[str, Path]) -> List[str]:
                     continue  # Skip hidden files
                 if any(part in self.IGNORE_FOLDERS for part in parts):
                     continue
+                if relpath.name == "prompt":
+                    continue  # Skip files named 'prompt'
 
                 all_files.append(str(relpath))
 

diff --git a/gpt_engineer/core/chat_to_files.py b/gpt_engineer/core/chat_to_files.py
@@ -1,206 +1,238 @@
 """
-Chat to Files Module
-
-This module provides utilities to handle and process chat content, especially for extracting code blocks
-and managing them within a specified GPT Engineer project ("workspace"). It offers functionalities like parsing chat messages to
-retrieve code blocks, storing these blocks into a workspace, and overwriting workspace content based on
-new chat messages. Moreover, it aids in formatting and reading file content for an AI agent's input.
-
-Key Features:
-- Parse and extract code blocks from chat messages.
-- Store and overwrite files within a workspace based on chat content.
-- Format files to be used as inputs for AI agents.
-- Retrieve files and their content based on a provided list.
-
-Dependencies:
-- `os` and `pathlib`: For handling OS-level operations and path manipulations.
-- `re`: For regex-based parsing of chat content.
-- `gpt_engineer.core.db`: Database handling functionalities for the workspace.
-- `gpt_engineer.cli.file_selector`: Constants related to file selection.
-
-Functions:
-- chat_to_files_dict(chat: str) -> FilesDict
-    Extracts code blocks from a chat and returns them as a FilesDict object.
-- overwrite_code_with_edits(chat: str, files_dict: FilesDict)
-    Overwrites code with edits extracted from chat.
-- parse_edits(chat: str) -> List[Edit]
-    Parses edits from a chat string and returns them as a list of Edit objects.
-- apply_edits(edits: List[Edit], files_dict: FilesDict)
-    Applies a list of edits to the given code object.
+This Python script provides functionalities for parsing chat transcripts that contain file paths and code blocks,
+applying diffs to these files, and parsing unified git diff format strings. The script is designed to work within
+a larger system that involves processing and manipulating code files based on chat inputs and diff information.
+
+Key Components:
+- chat_to_files_dict: Parses a chat transcript, extracting file paths and associated code blocks, and organizes
+  them into a FilesDict object, which is a custom dictionary format designed to hold file contents keyed by their paths.
+
+- apply_diffs: Takes a dictionary of Diff objects (which represent changes to be made to files) and a FilesDict
+  object containing the current state of files. It applies the changes described by the Diff objects to the
+  corresponding files in the FilesDict, updating the file contents as specified by the diffs.
+
+- parse_diffs: Parses a string containing diffs in the unified git diff format, extracting the changes described
+  in the diffs and organizing them into a dictionary of Diff objects, keyed by the filename to which each diff applies.
+
+- parse_diff_block: Parses a single block of text from a diff string, translating it into a Diff object that
+  represents the changes described in that block of text.
+
+This script is intended for use in environments where code collaboration or review is conducted through chat interfaces,
+allowing for the dynamic application of changes to code bases and the efficient handling of file and diff information in chat transcripts.
 """
 
 import logging
 import re
 
-from dataclasses import dataclass
-from typing import List
+from typing import Dict, Tuple
 
-from gpt_engineer.core.files_dict import FilesDict
+from gpt_engineer.core.diff import ADD, REMOVE, RETAIN, Diff, Hunk
+from gpt_engineer.core.files_dict import FilesDict, file_to_lines_dict
 
+# Initialize a logger for this module
 logger = logging.getLogger(__name__)
 
 
 def chat_to_files_dict(chat: str) -> FilesDict:
     """
-    Extracts all code blocks from a chat and returns them as a FilesDict object.
+    Converts a chat string containing file paths and code blocks into a FilesDict object.
 
-    Parses the chat string to identify and extract code blocks, which are then stored in a FilesDict
-    object with filenames as keys and code content as values.
+    Args:
+    - chat (str): The chat string containing file paths and code blocks.
 
-    Parameters
-    ----------
-    chat : str
-        The chat string to extract code blocks from.
-
-    Returns
-    -------
-    FilesDict
-        A FilesDict object containing the extracted code blocks, with filenames as keys.
+    Returns:
+    - FilesDict: A dictionary with file paths as keys and code blocks as values.
     """
-    # Get all ``` blocks and preceding filenames
+    # Regex to match file paths and associated code blocks
     regex = r"(\S+)\n\s*```[^\n]*\n(.+?)```"
     matches = re.finditer(regex, chat, re.DOTALL)
 
     files_dict = FilesDict()
     for match in matches:
-        # Strip the filename of any non-allowed characters and convert / to \
+        # Clean and standardize the file path
         path = re.sub(r'[\:<>"|?*]', "", match.group(1))
-
-        # Remove leading and trailing brackets
         path = re.sub(r"^\[(.*)\]$", r"\1", path)
-
-        # Remove leading and trailing backticks
         path = re.sub(r"^`(.*)`$", r"\1", path)
-
-        # Remove trailing ]
         path = re.sub(r"[\]\:]$", "", path)
 
-        # Get the code
+        # Extract and clean the code content
         content = match.group(2)
 
-        # Add the file to the list
+        # Add the cleaned path and content to the FilesDict
         files_dict[path.strip()] = content.strip()
 
-    return FilesDict(files_dict)
+    return files_dict
 
 
-def overwrite_code_with_edits(chat: str, files_dict: FilesDict):
+def apply_diffs(diffs: Dict[str, Diff], files: FilesDict) -> FilesDict:
     """
-    Overwrite code with edits extracted from chat.
+    Applies diffs to the provided files.
 
-    Takes a chat string, parses it for edits using the `parse_edits` function, and applies those edits
-    to the provided FilesDict object using the `apply_edits` function.
+    Args:
+    - diffs (Dict[str, Diff]): A dictionary of diffs to apply, keyed by filename.
+    - files (FilesDict): The original files to which diffs will be applied.
 
-    Parameters
-    ----------
-    chat : str
-        The chat content containing code edits.
-    files_dict : FilesDict
-        The FilesDict object to apply edits to.
+    Returns:
+    - FilesDict: The updated files after applying diffs.
     """
-    edits = parse_edits(chat)
-    apply_edits(edits, files_dict)
-
-
-@dataclass
-class Edit:
-    filename: str
-    before: str
-    after: str
-
-
-def parse_edits(chat: str) -> List[Edit]:
+    REMOVE_FLAG = "<REMOVE_LINE>"  # Placeholder to mark lines for removal
+    for diff in diffs.values():
+        if diff.is_new_file():
+            # If it's a new file, create it with the content from the diff
+            files[diff.filename_post] = "\n".join(
+                line[1] for hunk in diff.hunks for line in hunk.lines
+            )
+        else:
+            # Convert the file content to a dictionary of lines
+            line_dict = file_to_lines_dict(files[diff.filename_pre])
+            for hunk in diff.hunks:
+                current_line = hunk.start_line_pre_edit
+                for line in hunk.lines:
+                    if line[0] == RETAIN:
+                        current_line += 1
+                    elif line[0] == ADD:
+                        # Handle added lines
+                        current_line -= 1
+                        if (
+                            current_line in line_dict.keys()
+                            and line_dict[current_line] != REMOVE_FLAG
+                        ):
+                            line_dict[current_line] += "\n" + line[1]
+                        else:
+                            line_dict[current_line] = line[1]
+                        print(
+                            f"\nAdded line {line[1]} to {diff.filename_post} at line {current_line} end"
+                        )
+                        current_line += 1
+                    elif line[0] == REMOVE:
+                        # Mark removed lines with REMOVE_FLAG
+                        line_dict[current_line] = REMOVE_FLAG
+                        print(
+                            f"\nRemoved line {line[1]} from {diff.filename_post} at line {current_line}"
+                        )
+                        current_line += 1
+
+            # Remove lines marked for removal
+            line_dict = {
+                key: line_content
+                for key, line_content in line_dict.items()
+                if REMOVE_FLAG not in line_content
+            }
+            # Reassemble the file content
+            files[diff.filename_post] = "\n".join(line_dict.values())
+    return files
+
+
+def parse_diffs(diff_string: str) -> dict:
     """
-    Parse edits from a chat string.
+    Parses a diff string in the unified git diff format.
 
-    Extracts code edits from a chat string and returns them as a list of Edit objects. Each Edit object
-    contains the filename, the original code block, and the updated code block.
+    Args:
+    - diff_string (str): The diff string to parse.
 
-    Parameters
-    ----------
-    chat : str
-        The chat content containing code edits.
+    Returns:
+    - dict: A dictionary of Diff objects keyed by filename.
+    """
+    # Regex to match individual diff blocks
+    diff_block_pattern = re.compile(
+        r"```.*?\n\s*?--- .*?\n\s*?\+\+\+ .*?\n(?:@@ .*? @@\n(?:[-+ ].*?\n)*?)*?```",
+        re.DOTALL,
+    )
 
-    Returns
-    -------
-    List[Edit]
-        A list of Edit objects representing the parsed code edits.
+    diffs = {}
+    for block in diff_block_pattern.finditer(diff_string):
+        diff_block = block.group()
 
-    Raises
-    ------
-    ValueError
-        If the text cannot be parsed as a code edit.
-    """
+        # Parse individual diff blocks and update the diffs dictionary
+        diffs.update(parse_diff_block(diff_block))
 
-    def parse_one_edit(lines):
-        HEAD = "<<<<<<< HEAD"
-        DIVIDER = "\n=======\n"
-        UPDATE = ">>>>>>> updated"
+    if not diffs:
+        raise ValueError(
+            f"The diff {diff_string} is not a valid diff in the unified git diff format"
+        )
 
-        filename = lines.pop(0)
-        text = "\n".join(lines)
-        splits = text.split(DIVIDER)
-        if len(splits) != 2:
-            raise ValueError(f"Could not parse following text as code edit: \n{text}")
-        before, after = splits
+    return diffs
 
-        before = before.replace(HEAD, "").strip()
-        after = after.replace(UPDATE, "").strip()
 
-        return Edit(filename, before, after)
+def parse_diff_block(diff_block: str) -> dict:
+    """
+    Parses a block of diff text into a Diff object.
 
-    edits = []
-    current_edit = []
-    in_fence = False
+    Args:
+    - diff_block (str): A single block of diff text.
 
-    for line in chat.split("\n"):
-        if line.startswith("```") and in_fence:
-            edits.append(parse_one_edit(current_edit))
-            current_edit = []
-            in_fence = False
-            continue
-        elif line.startswith("```") and not in_fence:
-            in_fence = True
-            continue
+    Returns:
+    - dict: A dictionary containing a single Diff object keyed by the post-edit filename.
+    """
+    lines = diff_block.strip().split("\n")[1:-1]  # Exclude the opening and closing ```
+    diffs = {}
+    current_diff = None
+    hunk_lines = []
+    filename_pre = None
+    filename_post = None
+    hunk_header = None
+
+    for line in lines:
+        if line.startswith("--- "):
+            # Pre-edit filename
+            filename_pre = line[4:]
+        elif line.startswith("+++ "):
+            # Post-edit filename and initiation of a new Diff object
+            if (
+                filename_post is not None
+                and current_diff is not None
+                and hunk_header is not None
+            ):
+                current_diff.hunks.append(Hunk(*hunk_header, hunk_lines))
+                hunk_lines = []
+            filename_post = line[4:]
+            current_diff = Diff(filename_pre, filename_post)
+            diffs[filename_post] = current_diff
+        elif line.startswith("@@ "):
+            # Start of a new hunk in the diff
+            if hunk_lines and current_diff is not None and hunk_header is not None:
+                current_diff.hunks.append(Hunk(*hunk_header, hunk_lines))
+                hunk_lines = []
+            hunk_header = parse_hunk_header(line)
+        elif line.startswith("+"):
+            # Added line
+            hunk_lines.append((ADD, line[1:]))
+        elif line.startswith("-"):
+            # Removed line
+            hunk_lines.append((REMOVE, line[1:]))
+        else:
+            # Retained line
+            hunk_lines.append((RETAIN, line[1:]))
 
-        if in_fence:
-            current_edit.append(line)
+    # Append the last hunk if any
+    if current_diff is not None and hunk_lines and hunk_header is not None:
+        current_diff.hunks.append(Hunk(*hunk_header, hunk_lines))
 
-    return edits
+    return diffs
 
 
-def apply_edits(edits: List[Edit], files_dict: FilesDict):
+def parse_hunk_header(header_line) -> Tuple[int, int, int, int]:
     """
-    Apply a list of edits to the given FilesDict object.
+    Parses the header of a hunk from a diff.
 
-    Takes a list of Edit objects and applies each edit to the FilesDict object. It handles the creation
-    of new files and the modification of existing files based on the edits.
+    Args:
+    - header_line (str): The header line of a hunk.
 
-    Parameters
-    ----------
-    edits : List[Edit]
-        A list of Edit objects representing the code edits to apply.
-    files_dict : FilesDict
-        The FilesDict object to apply edits to.
+    Returns:
+    - tuple: A tuple containing start and length information for pre- and post-edit.
     """
-    for edit in edits:
-        filename = edit.filename
-        if edit.before == "":
-            if filename in files_dict:
-                logger.warning(
-                    f"The edit to be applied wants to create a new file `{filename}`, but that already exists. The file will be overwritten. See `.gpteng/memory` for previous version."
-                )
-            files_dict[filename] = edit.after  # new file
-        else:
-            occurrences_cnt = files_dict[filename].count(edit.before)
-            if occurrences_cnt == 0:
-                logger.warning(
-                    f"While applying an edit to `{filename}`, the code block to be replaced was not found. No instances will be replaced."
-                )
-            if occurrences_cnt > 1:
-                logger.warning(
-                    f"While applying an edit to `{filename}`, the code block to be replaced was found multiple times. All instances will be replaced."
-                )
-            files_dict[filename] = files_dict[filename].replace(
-                edit.before, edit.after
-            )  # existing file
+    pattern = re.compile(r"^@@ -\d{1,},\d{1,} \+\d{1,},\d{1,} @@$")
+
+    if not pattern.match(header_line):
+        # Return a default value if the header does not match the expected format
+        return 0, 0, 0, 0
+
+    pre, post = header_line.split(" ")[1:3]
+    start_line_pre_edit, hunk_len_pre_edit = map(int, pre[1:].split(","))
+    start_line_post_edit, hunk_len_post_edit = map(int, post[1:].split(","))
+    return (
+        start_line_pre_edit,
+        hunk_len_pre_edit,
+        start_line_post_edit,
+        hunk_len_post_edit,
+    )