Improve agent xray app #70

Open · wants to merge 3 commits into base: dev
2 changes: 1 addition & 1 deletion .github/workflows/unit_tests.yml
@@ -9,7 +9,7 @@ on:
 jobs:

   agentlab:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04

     defaults:
       run:
1 change: 1 addition & 0 deletions reproducibility_journal.csv
@@ -9,3 +9,4 @@ recursix,GenericAgent-gpt-4o-mini-2024-07-18,miniwob,0.6.3,2024-10-01_11-45-23,0
 recursix,GenericAgent-gpt-4o-mini-2024-07-18,workarena.l1,0.3.2,2024-10-05_13-21-27,0.23,0.023,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,aadf86b397cd36c581e1a61e491aec649ac5a140, M: main.py,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
 recursix,GenericAgent-gpt-4o-2024-05-13,workarena.l1,0.3.2,2024-10-05_15-45-42,0.382,0.027,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,ab447e997af589bbd022de7a5189a7685ddfa6ef,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
 recursix,GenericAgent-meta-llama_llama-3.1-70b-instruct,miniwob_tiny_test,0.7.0,2024-10-05_17-49-15,1.0,0.0,0,4/4,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,a98fa24426a6ddde8443e8be44ed94cd9522e5ca,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
+recursix,GenericAgent-meta-llama_llama-3-70b-instruct,workarena.l1,0.3.2,2024-10-09_21-16-37,0.176,0.021,0,330/330,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.6,1.39.0,0.2.1,c847dbd334184271b32b252409a1b6c1042d7442,,0.7.0,2a0ab7e8e8795f8ca35fe4d4d67c6892d635dc12,
2 changes: 1 addition & 1 deletion src/agentlab/__init__.py
@@ -1 +1 @@
__version__ = "0.2.1"
__version__ = "0.2.2"
22 changes: 17 additions & 5 deletions src/agentlab/agents/generic_agent/reproducibility_agent.py
@@ -43,7 +43,7 @@ def __init__(self, old_messages, delay=1) -> None:
         self.old_messages = old_messages
         self.delay = delay

-    def invoke(self, messages: list):
+    def __call__(self, messages: list):
         self.new_messages = copy(messages)

         if len(messages) >= len(self.old_messages):
@@ -56,6 +56,9 @@ def invoke(self, messages: list):
         # return the next message in the list
         return old_response

+    def get_stats(self):
+        return {}
+

 @dataclass
 class ReproAgentArgs(GenericAgentArgs):
@@ -102,6 +105,14 @@ def get_action(self, obs):
             )
             return None, agent_info

+        # an old bug prevented the response from being saved.
+        if len(old_chat_messages) == 2:
+            recorded_action = step_info.action
+            if recorded_action:
+                # Recreate the 3rd message based on the recorded action
+                assistant_message = make_assistant_message(f"<action>{recorded_action}</action>")
+                old_chat_messages.append(assistant_message)
+
         self.chat_llm = ReproChatModel(old_chat_messages)
         action, agent_info = super().get_action(obs)

@@ -128,27 +139,28 @@ def _format_messages(messages: list[dict]):
     return "\n".join(f"{m['role']} message:\n{m['content']}\n" for m in messages)


-def reproduce_study(original_study_dir: Path | str):
+def reproduce_study(original_study_dir: Path | str, log_level=logging.INFO):
     """Reproduce a study by running the same experiments with the same agent."""

     original_study_dir = Path(original_study_dir)

     study_name = f"reproducibility_of_{original_study_dir.name}"

-    exp_args_list = []
+    exp_args_list: list[ExpArgs] = []
     for exp_result in yield_all_exp_results(original_study_dir, progress_fn=None):
         agent_args = make_repro_agent(exp_result.exp_args.agent_args, exp_dir=exp_result.exp_dir)
         exp_args_list.append(
             ExpArgs(
                 agent_args=agent_args,
                 env_args=exp_result.exp_args.env_args,
-                logging_level=logging.DEBUG,
+                logging_level=log_level,
             )
         )
+    benchmark_name = exp_args_list[0].env_args.task_name.split(".")[0]

     return Study(
         exp_args_list=exp_args_list,
-        benchmark_name="repro_study",
+        benchmark_name=benchmark_name,
         agent_names=[agent_args.agent_name],
     )

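
The guard added to get_action above works around logs written before the assistant reply was saved: when only the system and user messages survive, the reply is rebuilt from the recorded action. A minimal sketch of that recovery step, with an illustrative helper name (patch_missing_response is not part of AgentLab):

def patch_missing_response(old_chat_messages: list[dict], recorded_action: str | None) -> None:
    """Rebuild a lost assistant reply from the action recorded in step_info."""
    if len(old_chat_messages) == 2 and recorded_action:
        # Only [system, user] were saved; append the reply the agent should replay.
        old_chat_messages.append(
            {"role": "assistant", "content": f"<action>{recorded_action}</action>"}
        )
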
4 changes: 3 additions & 1 deletion src/agentlab/analyze/agent_xray.py
@@ -482,7 +482,9 @@ def run_gradio(results_dir: Path):
         tabs.select(tab_select)

     demo.queue()
-    demo.launch(server_port=int(os.getenv("AGENTXRAY_APP_PORT", 7899)), share=True)
+
+    do_share = os.getenv("AGENTXRAY_SHARE_GRADIO", 'false').lower() == 'true'
+    demo.launch(server_port=int(os.getenv("AGENTXRAY_APP_PORT", "7899")), share=do_share)


 def tab_select(evt: gr.SelectData):
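
Sharing is now opt-in: only AGENTXRAY_SHARE_GRADIO set to the exact string "true" (any case) enables a public Gradio link. A hedged sketch of the same pattern generalized to other truthy spellings (env_flag is illustrative, not an AgentLab function):

import os


def env_flag(name: str, default: bool = False) -> bool:
    """Read an environment variable as a boolean flag."""
    value = os.getenv(name)
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes", "on")


# env_flag("AGENTXRAY_SHARE_GRADIO") mirrors the check above, while also
# accepting "1", "yes", and "on".
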
5 changes: 3 additions & 2 deletions src/agentlab/analyze/inspect_results.py
@@ -247,6 +247,7 @@ def get_std_err(df, metric):
         std_err = np.sqrt(mean * (1 - mean) / len(data))
     else:
         return get_sample_std_err(df, metric)
+
     return mean, std_err


@@ -258,7 +259,7 @@ def get_sample_std_err(df, metric):
     mean = np.mean(data)
     std_err = np.std(data, ddof=1) / np.sqrt(len(data))
     if np.isnan(std_err):
-        std_err = 0
+        std_err = np.zeros_like(std_err)
     return mean, std_err


@@ -289,7 +290,7 @@ def summarize(sub_df, use_bootstrap=False):

     record = dict(
         avg_reward=sub_df["cum_reward"].mean(skipna=True).round(3),
-        std_err=std_reward.round(3),
+        std_err=std_reward.astype(float).round(3),
         # avg_raw_reward=sub_df["cum_raw_reward"].mean(skipna=True).round(3),
         avg_steps=sub_df["n_steps"].mean(skipna=True).round(3),
         n_completed=f"{n_completed}/{len(sub_df)}",
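
For context, get_std_err chooses between two estimators: the binomial formula sqrt(p * (1 - p) / n) when the metric only takes values 0 and 1, and the sample formula otherwise. A self-contained sketch of that logic over a plain array (std_err_sketch is illustrative; the real functions work on a DataFrame):

import numpy as np


def std_err_sketch(data: np.ndarray) -> tuple[float, float]:
    """Mean and standard error, using the binomial formula for 0/1 data."""
    mean = float(np.mean(data))
    if set(np.unique(data)) <= {0.0, 1.0}:
        # Bernoulli metric: standard error of the mean is sqrt(p * (1 - p) / n).
        std_err = np.sqrt(mean * (1 - mean) / len(data))
    else:
        # General case: sample standard deviation over sqrt(n).
        std_err = np.std(data, ddof=1) / np.sqrt(len(data))
    # Mirror the NaN guard above (a single sample yields NaN with ddof=1).
    return mean, 0.0 if np.isnan(std_err) else float(std_err)
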
8 changes: 2 additions & 6 deletions src/agentlab/experiments/reproduce_study.py
@@ -5,18 +5,14 @@
 the diff in HTML format.
 """

-import logging
-
 from agentlab.agents.generic_agent.reproducibility_agent import reproduce_study
 from agentlab.experiments.exp_utils import RESULTS_DIR

-logging.getLogger().setLevel(logging.INFO)
-
-
 if __name__ == "__main__":

-    old_study = "2024-06-02_18-16-17_final_run"
+    # old_study = "2024-09-12_08-39-16_GenericAgent-gpt-4o-mini_on_miniwob_tiny_test"
     # old_study = "2024-06-03_13-53-50_final_run_workarena_L1_llama3-70b"
+    old_study = "2024-06-03_12-28-51_final_run_miniwob_llama3-70b"

     study = reproduce_study(RESULTS_DIR / old_study)
     n_jobs = 1
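
With the global logging setup gone, a reproduction needs only the study directory; verbosity is passed through reproduce_study's log_level argument. A minimal usage sketch (the study name is one of the examples from the script above):

import logging

from agentlab.agents.generic_agent.reproducibility_agent import reproduce_study
from agentlab.experiments.exp_utils import RESULTS_DIR

# Build a Study that replays every experiment of the original run.
study = reproduce_study(
    RESULTS_DIR / "2024-06-03_12-28-51_final_run_miniwob_llama3-70b",
    log_level=logging.INFO,
)
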
7 changes: 5 additions & 2 deletions src/agentlab/experiments/study_generators.py
@@ -153,7 +153,10 @@ def set_demo_mode(env_args_list: list[EnvArgs]):


 def run_agents_on_benchmark(
-    agents: list[AgentArgs] | AgentArgs = AGENT_4o_MINI, benchmark: str = "miniwob", demo_mode=False
+    agents: list[AgentArgs] | AgentArgs = AGENT_4o_MINI,
+    benchmark: str = "miniwob",
+    demo_mode=False,
+    log_level=logging.INFO,
 ):
     """Run one or multiple agents on a benchmark.

@@ -190,7 +193,7 @@
         ExpArgs(
             agent_args=args.CrossProd(agents),
             env_args=args.CrossProd(env_args_list),
-            logging_level=logging.DEBUG,
+            logging_level=log_level,
         )
     )

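
Callers can now tune verbosity per study instead of inheriting the hard-coded DEBUG level. A short sketch, assuming the defaults shown in the signature above and that the function returns a Study as the module name suggests:

import logging

from agentlab.experiments.study_generators import run_agents_on_benchmark

# Same defaults as before (AGENT_4o_MINI on miniwob), but with quieter logs.
study = run_agents_on_benchmark(benchmark="miniwob", log_level=logging.WARNING)
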
33 changes: 33 additions & 0 deletions src/agentlab/llm/base_api.py
@@ -0,0 +1,33 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+
+class AbstractChatModel(ABC):
+    @abstractmethod
+    def __call__(self, messages: list[dict]) -> dict:
+        pass
+
+    def get_stats(self):
+        return {}
+
+
+@dataclass
+class BaseModelArgs(ABC):
+    """Base class for all model arguments."""
+
+    model_name: str
+    max_total_tokens: int = None
+    max_input_tokens: int = None
+    max_new_tokens: int = None
+    temperature: float = 0.1
+    vision_support: bool = False
+
+    @abstractmethod
+    def make_model(self) -> AbstractChatModel:
+        pass
+
+    def prepare_server(self):
+        pass
+
+    def close_server(self):
+        pass
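
The new module gives external models a small contract: implement __call__ (get_stats has a default), and pair it with a BaseModelArgs subclass whose make_model builds the model. A hypothetical minimal implementation (EchoChatModel and EchoModelArgs are made up for illustration):

from dataclasses import dataclass

from agentlab.llm.base_api import AbstractChatModel, BaseModelArgs


class EchoChatModel(AbstractChatModel):
    """Toy model that replies with the content of the last message."""

    def __call__(self, messages: list[dict]) -> dict:
        return {"role": "assistant", "content": messages[-1]["content"]}


@dataclass
class EchoModelArgs(BaseModelArgs):
    def make_model(self) -> AbstractChatModel:
        return EchoChatModel()


# Usage: model = EchoModelArgs(model_name="echo").make_model()
#        reply = model([{"role": "user", "content": "hi"}])
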
67 changes: 31 additions & 36 deletions src/agentlab/llm/chat_api.py
@@ -2,14 +2,17 @@
 import os
 import re
 import time
-from abc import ABC, abstractmethod
 from dataclasses import dataclass
+from functools import partial
+from typing import Optional

 import openai
+from huggingface_hub import InferenceClient
 from openai import AzureOpenAI, OpenAI

 import agentlab.llm.tracking as tracking
-from agentlab.llm.huggingface_utils import HuggingFaceURLChatModel
+from agentlab.llm.base_api import AbstractChatModel, BaseModelArgs
+from agentlab.llm.huggingface_utils import HFBaseChatModel


 def make_system_message(content: str) -> dict:
@@ -24,10 +27,10 @@ def make_assistant_message(content: str) -> dict:
     return dict(role="assistant", content=content)


-class CheatMiniWoBLLM:
+class CheatMiniWoBLLM(AbstractChatModel):
     """For unit-testing purposes only. It only work with miniwob.click-test task."""

-    def invoke(self, messages) -> str:
+    def __call__(self, messages) -> str:
         prompt = messages[-1]["content"]
         match = re.search(r"^\s*\[(\d+)\].*button", prompt, re.MULTILINE | re.IGNORECASE)

@@ -44,12 +47,6 @@ def invoke(self, messages) -> str:
         """
         return make_assistant_message(answer)

-    def __call__(self, messages) -> str:
-        return self.invoke(messages)
-
-    def get_stats(self):
-        return {}
-

 @dataclass
 class CheatMiniWoBLLMArgs:
@@ -68,28 +65,6 @@ def close_server(self):
         pass

-
-@dataclass
-class BaseModelArgs(ABC):
-    """Base class for all model arguments."""
-
-    model_name: str
-    max_total_tokens: int = None
-    max_input_tokens: int = None
-    max_new_tokens: int = None
-    temperature: float = 0.1
-    vision_support: bool = False
-
-    @abstractmethod
-    def make_model(self) -> "ChatModel":
-        pass
-
-    def prepare_server(self):
-        pass
-
-    def close_server(self):
-        pass
-

 @dataclass
 class OpenRouterModelArgs(BaseModelArgs):
     """Serializable object for instantiating a generic chat model with an OpenAI
@@ -221,7 +196,7 @@ def handle_error(error, itr, min_retry_wait_time, max_retry):
     return error_type


-class ChatModel:
+class ChatModel(AbstractChatModel):
     def __init__(
         self,
         model_name,
@@ -310,9 +285,6 @@ def __call__(self, messages: list[dict]) -> dict:

         return make_assistant_message(completion.choices[0].message.content)

-    def invoke(self, messages: list[dict]) -> dict:
-        return self(messages)
-
     def get_stats(self):
         return {
             "n_retry_llm": self.retries,
@@ -401,3 +373,26 @@ def __init__(
             client_args=client_args,
             pricing_func=tracking.get_pricing_openai,
         )
+
+
+class HuggingFaceURLChatModel(HFBaseChatModel):
+    def __init__(
+        self,
+        model_name: str,
+        model_url: str,
+        token: Optional[str] = None,
+        temperature: Optional[int] = 1e-1,
+        max_new_tokens: Optional[int] = 512,
+        n_retry_server: Optional[int] = 4,
+    ):
+        super().__init__(model_name, n_retry_server)
+        if temperature < 1e-3:
+            logging.warning("Models might behave weirdly when temperature is too low.")
+
+        if token is None:
+            token = os.environ["TGI_TOKEN"]
+
+        client = InferenceClient(model=model_url, token=token)
+        self.llm = partial(
+            client.text_generation, temperature=temperature, max_new_tokens=max_new_tokens
+        )
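
After this refactor every chat model is called directly; the invoke alias is gone and get_stats comes from AbstractChatModel unless overridden. A quick sketch against the unit-test cheat model (the prompt line is a made-up miniwob-style observation):

from agentlab.llm.chat_api import CheatMiniWoBLLM

llm = CheatMiniWoBLLM()
messages = [{"role": "user", "content": "[12] <button> Click me </button>"}]
reply = llm(messages)    # __call__ replaces the old invoke()
print(reply["content"])  # contains an <action> clicking bid 12
print(llm.get_stats())   # {} by default, via AbstractChatModel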