Preliminary AI Feedback for all Apollon UML Diagrams #335

Merged
merged 17 commits on Sep 12, 2024
Changes from 12 commits
4 changes: 2 additions & 2 deletions .vscode/launch.json
@@ -26,7 +26,7 @@
"request": "launch",
"cwd": "${workspaceFolder}/modules/programming/module_programming_llm",
"module": "module_programming_llm",
"justMyCode": true
"justMyCode": false
},
{
"name": "Module Programming ThemisML",
@@ -62,7 +62,7 @@
"type": "python",
"request": "launch",
"cwd": "${workspaceFolder}/modules/modeling/module_modeling_llm",
"module": "module_text_cofee"
"module": "module_modeling_llm"
}
]
}
2 changes: 1 addition & 1 deletion assessment_module_manager/modules.docker.ini
@@ -44,6 +44,6 @@ supports_graded_feedback_requests = false
url = http://module-modeling-llm:5008
type = modeling
supports_evaluation = false
- supports_non_graded_feedback_requests = false
+ supports_non_graded_feedback_requests = true
supports_graded_feedback_requests = true

2 changes: 1 addition & 1 deletion assessment_module_manager/modules.ini
@@ -44,6 +44,6 @@ supports_graded_feedback_requests = false
url = http://localhost:5008
type = modeling
supports_evaluation = false
- supports_non_graded_feedback_requests = false
+ supports_non_graded_feedback_requests = true
supports_graded_feedback_requests = true

10 changes: 5 additions & 5 deletions modules/modeling/module_modeling_llm/.env.example
@@ -12,7 +12,7 @@ DATABASE_URL=sqlite:///../data/data.sqlite

# Default model to use
# See below for options, available models are also logged on startup
- LLM_DEFAULT_MODEL="azure_openai_gpt-35"
+ LLM_DEFAULT_MODEL="azure_openai_gpt-4o"

# Enable LLM-as-a-judge approach 0 = disabled, 1 = enabled
LLM_ENABLE_LLM_AS_A_JUDGE=1
@@ -23,13 +23,13 @@ LLM_EVALUATION_MODEL="azure_openai_gpt-4"
# Standard OpenAI (Non-Azure) [leave blank if not used]
# Model names prefixed with `openai_` followed by the model name, e.g. `openai_text-davinci-003`
# A list of models can be found in `module_text_llm/helpers/models/openai.py` (openai_models)
- LLM_OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

# Azure OpenAI [leave blank if not used]
# Model names prefixed with `azure_openai_` followed by the deployment id, e.g. `azure_openai_gpt-35`
- LLM_AZURE_OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
- LLM_AZURE_OPENAI_API_BASE="https://ase-eu01.openai.azure.com/" # change base if needed
- LLM_AZURE_OPENAI_API_VERSION="2023-07-01-preview" # change base if needed
+ AZURE_OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ AZURE_OPENAI_ENDPOINT="https://ase-eu01.openai.azure.com/" # change base if needed
+ OPENAI_API_VERSION="2023-07-01-preview" # change base if needed

# Replicate [leave blank if not used]
# See https://replicate.com and adjust model config options in `module_text_llm/helpers/models/replicate.py`
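Note (not part of the diff): renaming the variables to the standard OPENAI_API_KEY / AZURE_OPENAI_API_KEY / AZURE_OPENAI_ENDPOINT / OPENAI_API_VERSION names lets the OpenAI and langchain clients pick the credentials up straight from the environment. A minimal sketch, assuming the module ends up instantiating langchain's AzureChatOpenAI (the deployment name below is illustrative):

```python
# Sketch only: relies on the renamed variables being present in a local .env file.
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI

load_dotenv()  # exports AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, OPENAI_API_VERSION

# AzureChatOpenAI reads the key, endpoint and API version from those environment
# variables, so only the deployment id needs to be passed explicitly.
model = AzureChatOpenAI(azure_deployment="gpt-4o")
```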
@@ -31,7 +31,7 @@ def process_incoming_feedback(exercise: Exercise, submission: Submission, feedba
async def suggest_feedback(exercise: Exercise, submission: Submission, is_graded: bool, module_config: Configuration) -> List[Feedback]:
logger.info("suggest_feedback: Suggestions for submission %d of exercise %d were requested", submission.id,
exercise.id)
- return await generate_suggestions(exercise, submission, module_config.approach, module_config.debug)
+ return await generate_suggestions(exercise, submission, is_graded, module_config.approach, module_config.debug)


if __name__ == "__main__":
19 changes: 13 additions & 6 deletions modules/modeling/module_modeling_llm/module_modeling_llm/config.py
@@ -3,8 +3,10 @@
from athena import config_schema_provider
from module_modeling_llm.helpers.models import ModelConfigType, DefaultModelConfig
from module_modeling_llm.prompts.generate_suggestions import (
- system_message as generate_suggestions_system_message,
- human_message as generate_suggestions_human_message
+ graded_feedback_system_message as default_graded_feedback_system_message,
+ graded_feedback_human_message as default_graded_feedback_human_message,
+ filter_feedback_system_message as default_filter_feedback_system_message,
+ filter_feedback_human_message as default_filter_feedback_human_message
)


@@ -16,10 +18,15 @@ class GenerateSuggestionsPrompt(BaseModel):
_Note: **{problem_statement}**, **{example_solution}**, or **{grading_instructions}** might be omitted if the input
is too long._
"""
- system_message: str = Field(default=generate_suggestions_system_message,
- description="Message for priming AI behavior and instructing it what to do.")
- human_message: str = Field(default=generate_suggestions_human_message,
- description="Message from a human. The input on which the AI is supposed to act.")
+ graded_feedback_system_message: str = Field(default=default_graded_feedback_system_message,
[Review comment by a collaborator] You can remove the graded naming now
description="Message for priming AI behavior and instructing it what to do.")
graded_feedback_human_message: str = Field(default=default_graded_feedback_human_message,
description="Message from a human. The input on which the AI is supposed to act.")
filter_feedback_system_message: str = Field(default=default_filter_feedback_system_message,
description="Message for priming AI behavior for filtering ungraded feedback.")
filter_feedback_human_message: str = Field(default=default_filter_feedback_human_message,
description="Message for instructing AI to filter ungraded feedback.")



class BasicApproachConfig(BaseModel):
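Not part of the diff, but for orientation: because the new messages are exposed as Field defaults on GenerateSuggestionsPrompt, they can be overridden through the module config without touching the prompt files. A minimal sketch, assuming the remaining fields keep their defaults (the override text is made up):

```python
# Sketch only: field names are taken from the diff above, the message text is illustrative.
from module_modeling_llm.config import GenerateSuggestionsPrompt

custom_prompts = GenerateSuggestionsPrompt(
    filter_feedback_system_message=(
        "You review assessment feedback before it is shown to a student. "
        "Keep only actionable, non-graded hints and drop all point deductions."
    )
)

# The resulting object can then be plugged into BasicApproachConfig via its
# generate_suggestions_prompt field, which is how generate_suggestions.py reads it.
```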
@@ -1,28 +1,23 @@
import json
from typing import List, Optional, Sequence

+ from module_modeling_llm.prompts.apollon_format import apollon_format_description
from pydantic import BaseModel, Field

from athena import emit_meta
from athena.logger import logger
from athena.modeling import Exercise, Submission, Feedback
from module_modeling_llm.config import BasicApproachConfig
- from module_modeling_llm.helpers.llm_utils import (
- get_chat_prompt_with_formatting_instructions,
- check_prompt_length_and_omit_features_if_necessary,
- num_tokens_from_prompt,
- predict_and_parse
- )
- from module_modeling_llm.helpers.models.diagram_types import DiagramType
+ from langchain_core.output_parsers import PydanticOutputParser
+ from langchain_core.prompts import ChatPromptTemplate
+ from module_modeling_llm.helpers.llm_utils import predict_and_parse
from module_modeling_llm.helpers.serializers.diagram_model_serializer import DiagramModelSerializer
- from module_modeling_llm.helpers.utils import format_grading_instructions, get_elements
- from module_modeling_llm.prompts.submission_format.submission_format_remarks import get_submission_format_remarks
+ from module_modeling_llm.helpers.utils import format_grading_instructions


class FeedbackModel(BaseModel):
title: str = Field(description="Very short title, i.e. feedback category or similar", example="Logic Error")
description: str = Field(description="Feedback description")
- element_ids: Optional[str] = Field(description="Referenced diagram element IDs, or empty if unreferenced")
+ element_names: Optional[List[str]] = Field(description="Referenced diagram element names, and relations (R<number>) or empty if unreferenced")
credits: float = Field(0.0, description="Number of points received/deducted")
grading_instruction_id: Optional[int] = Field(
description="ID of the grading instruction that was used to generate this feedback, or empty if no grading instruction was used"
@@ -35,25 +30,13 @@ class Config:
class AssessmentModel(BaseModel):
"""Collection of feedbacks making up an assessment"""

- feedbacks: Sequence[FeedbackModel] = Field(description="Assessment feedbacks")
+ feedbacks: Sequence[FeedbackModel] = Field(description="Assessment feedbacks, make sure to include all grading instructions")

class Config:
title = "Assessment"


- def filter_ids_for_model(ids: List[str], model: dict) -> List[str]:
- """
- Filter a list of element ids based on whether a corresponding element is present in a given diagram model.
- :param ids: List of ids that should be filtered
- :param model: Diagram model in which elements with the given ids should be contained
- :return The filtered list of IDs
- """
- elements: list[dict] = get_elements(model)
- model_ids: set[str] = {str(element.get("id")) for element in elements}
- return list(filter(lambda id: id in model_ids, ids))


- async def generate_suggestions(exercise: Exercise, submission: Submission, config: BasicApproachConfig, debug: bool) -> \
+ async def generate_suggestions(exercise: Exercise, submission: Submission, is_graded: bool, config: BasicApproachConfig, debug: bool) -> \
List[Feedback]:
"""
Generate feedback suggestions for modeling exercise submissions
@@ -65,56 +48,31 @@ """
"""
model = config.model.get_model() # type: ignore[attr-defined]

- serialized_example_solution = None
+ print("Model ", model)
+
+ serialized_example_solution = None
if exercise.example_solution:
example_solution_diagram = json.loads(exercise.example_solution)
serialized_example_solution, _ = DiagramModelSerializer.serialize_model(example_solution_diagram)

submission_diagram = json.loads(submission.model)
- submission_format_remarks = get_submission_format_remarks(submission_diagram.get("type"))

# Having the LLM reference IDs that a specific feedback item applies to seems to work a lot more reliable with
# shorter IDs, especially if they are prefixed with "id_". We therefore map the UUIDs used in Apollon diagrams to
# shortened IDs and have the diagram model serializer return a reverse mapping dictionary which allows us to map
# the shortened IDs back to the original ones.
- serialized_submission, reverse_id_map = DiagramModelSerializer.serialize_model(submission_diagram)
+ serialized_submission, element_id_mapping = DiagramModelSerializer.serialize_model(submission_diagram)

prompt_input = {
"max_points": exercise.max_points,
"bonus_points": exercise.bonus_points,
"grading_instructions": format_grading_instructions(exercise.grading_instructions, exercise.grading_criteria),
"submission_format_remarks": submission_format_remarks,
"submission_format": submission_diagram.get("type"),
[Review comment by a collaborator] I would imagine that you select a prompt for each diagram type here instead of passing just the type into a generalized prompt.
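To make the suggestion concrete, one possible shape for that selection, purely illustrative and not part of the PR (the type strings follow Apollon's naming, the message texts are placeholders):

```python
# Hypothetical per-diagram-type prompt selection; every name here is a sketch.
DIAGRAM_TYPE_SYSTEM_MESSAGES = {
    "ClassDiagram": "You are assessing a UML class diagram submission ...",
    "ActivityDiagram": "You are assessing a UML activity diagram submission ...",
}

def select_system_message(diagram_type: str, fallback: str) -> str:
    """Pick a type-specific system prompt, falling back to the generic one."""
    return DIAGRAM_TYPE_SYSTEM_MESSAGES.get(diagram_type, fallback)
```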

"problem_statement": exercise.problem_statement or "No problem statement.",
"example_solution": serialized_example_solution or "No example solution.",
"submission": serialized_submission
"submission": serialized_submission,
"uml_diagram_format": apollon_format_description,
LeonWehrhahn marked this conversation as resolved.
Show resolved Hide resolved
"format_instructions": PydanticOutputParser(pydantic_object=AssessmentModel).get_format_instructions()
}

- chat_prompt = get_chat_prompt_with_formatting_instructions(
- model=model,
- system_message=config.generate_suggestions_prompt.system_message,
- human_message=config.generate_suggestions_prompt.human_message,
- pydantic_object=AssessmentModel
- )
-
- # Check if the prompt is too long and omit features if necessary (in order of importance)
- omittable_features = ["example_solution", "problem_statement", "grading_instructions"]
- prompt_input, should_run = check_prompt_length_and_omit_features_if_necessary(
- prompt=chat_prompt,
- prompt_input=prompt_input,
- max_input_tokens=10000, # config.max_input_tokens,
- omittable_features=omittable_features,
- debug=debug
- )
-
- # Skip if the prompt is too long
- if not should_run:
- logger.warning("Input too long. Skipping.")
- if debug:
- emit_meta("prompt", chat_prompt.format(**prompt_input))
- emit_meta("error",
- f"Input too long {num_tokens_from_prompt(chat_prompt, prompt_input)} > {config.max_input_tokens}")
- return []
+ chat_prompt = ChatPromptTemplate.from_messages([
+ ("system", config.generate_suggestions_prompt.graded_feedback_system_message),
+ ("human", config.generate_suggestions_prompt.graded_feedback_human_message)])

result = await predict_and_parse(
model=model,
@@ -136,6 +94,40 @@ async def generate_suggestions(exercise: Exercise, submission: Submission, confi
if result is None:
return []

+ # Check if is graded
+ if is_graded is False:
+ filter_chat_prompt = ChatPromptTemplate.from_messages([
+ ("system", config.generate_suggestions_prompt.filter_feedback_system_message),
+ ("human", config.generate_suggestions_prompt.filter_feedback_human_message)
+ ])
+
+ filter_prompt_input = {
+ "original_feedback": result.dict(),
+ "format_instructions": PydanticOutputParser(pydantic_object=AssessmentModel).get_format_instructions()
+ }
+
+ print("Filter prompt input", filter_prompt_input)
+
+ result = await predict_and_parse(
+ model=model,
+ chat_prompt=filter_chat_prompt,
+ prompt_input=filter_prompt_input,
+ pydantic_object=AssessmentModel,
+ tags=[
+ f"exercise-{exercise.id}-filter",
+ f"submission-{submission.id}-filter",
+ ]
+ )
+
+ if debug:
+ emit_meta("filter_feedback", {
+ "prompt": filter_chat_prompt.format(**filter_prompt_input),
+ "result": result.dict() if result is not None else None
+ })
+
+ if result is None:
+ return []

grading_instruction_ids = set(
grading_instruction.id
for criterion in exercise.grading_criteria or []
@@ -145,21 +137,20 @@ async def generate_suggestions(exercise: Exercise, submission: Submission, confi
feedbacks = []
for feedback in result.feedbacks:
grading_instruction_id = feedback.grading_instruction_id if feedback.grading_instruction_id in grading_instruction_ids else None
- element_ids = list(
- map(lambda element_id: reverse_id_map[
- element_id.strip()
- ] if reverse_id_map else element_id.strip(), feedback.element_ids.split(","))
- ) if feedback.element_ids else []
+ element_ids = [element_id_mapping[element] for element in (feedback.element_names or [])]


feedbacks.append(Feedback(
exercise_id=exercise.id,
submission_id=submission.id,
title=feedback.title,
description=feedback.description,
- element_ids=filter_ids_for_model(element_ids, submission_diagram),
+ element_ids=element_ids,
credits=feedback.credits,
structured_grading_instruction_id=grading_instruction_id,
- meta={}
+ meta={},
+ id=None,
+ is_graded=False
))

- return feedbacks
+ return feedbacks
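Side note, not part of the diff: the mapping step above assumes every element name the LLM returns is a key in element_id_mapping; a name the model invents would raise a KeyError and abort the whole request. A small sketch of a more defensive variant of the same lookup (helper name and behavior are my own, not from the PR):

```python
from typing import Dict, List, Optional

def map_element_names_to_ids(element_names: Optional[List[str]],
                             element_id_mapping: Dict[str, str]) -> List[str]:
    """Translate LLM-referenced element names back to Apollon element UUIDs.

    Names that are not present in the serializer's mapping are skipped, so a
    single hallucinated reference does not discard the feedback item.
    """
    if not element_names:
        return []
    return [element_id_mapping[name] for name in element_names if name in element_id_mapping]

# Usage with the names from the diff:
#   serialized_submission, element_id_mapping = DiagramModelSerializer.serialize_model(submission_diagram)
#   element_ids = map_element_names_to_ids(feedback.element_names, element_id_mapping)
```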