From 06379826668f9bcf3fcc826d463a2ef0c687af1f Mon Sep 17 00:00:00 2001
From: arkadyark-cohere <arkady@cohere.com>
Date: Fri, 12 Apr 2024 16:42:38 +0000
Subject: [PATCH 1/2] Remove blobfile dep, load directly from URL

---
 drop_eval.py      | 7 ++++---
 gpqa_eval.py      | 5 +----
 humaneval_eval.py | 1 -
 math_eval.py      | 3 +--
 mgsm_eval.py      | 7 +++----
 mmlu_eval.py      | 3 +--
 6 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/drop_eval.py b/drop_eval.py
index b66b5d2..7f97f5f 100644
--- a/drop_eval.py
+++ b/drop_eval.py
@@ -10,8 +10,8 @@
 import re
 import string
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
+import urllib.request
 
-import blobfile as bf
 import numpy as np
 from scipy.optimize import linear_sum_assignment
 
@@ -247,9 +247,10 @@ def __init__(self, num_examples: int | None = None, train_samples_per_prompt: in
         self.test_jsonl = (
             "https://openaipublic.blob.core.windows.net/simple-evals/drop_v0_dev.jsonl.gz"
         )
-        with gzip.GzipFile(fileobj=bf.BlobFile(self.train_jsonl, "rb"), mode="rb") as f:
+        breakpoint()
+        with gzip.GzipFile(fileobj=urllib.request.urlopen(self.train_jsonl), mode="rb") as f:
             self.train_samples = list(map(json.loads, f.readlines()))
-        with gzip.GzipFile(fileobj=bf.BlobFile(self.test_jsonl, "rb"), mode="rb") as f:
+        with gzip.GzipFile(fileobj=urllib.request.urlopen(self.test_jsonl), mode="rb") as f:
             self.test_samples = list(map(json.loads, f.readlines()))
             if self._num_examples:
                 self.test_samples = random.Random(self.seed).sample(
diff --git a/gpqa_eval.py b/gpqa_eval.py
index 3eb2a73..5ffbf1f 100644
--- a/gpqa_eval.py
+++ b/gpqa_eval.py
@@ -7,7 +7,6 @@
 import random
 import re
 
-import blobfile as bf
 import pandas
 
 from . import common
@@ -27,9 +26,7 @@ def __init__(
         num_examples: int | None = None,  # restrict to a subset of the data for debugging
     ):
         df = pandas.read_csv(
-            bf.BlobFile(
-                f"https://openaipublic.blob.core.windows.net/simple-evals/gpqa_{variant}.csv"
-            )
+            f"https://openaipublic.blob.core.windows.net/simple-evals/gpqa_{variant}.csv"
         )
         examples = [row.to_dict() for _, row in df.iterrows()]
         rng = random.Random(0)
diff --git a/humaneval_eval.py b/humaneval_eval.py
index fe3f8fe..9f0db97 100644
--- a/humaneval_eval.py
+++ b/humaneval_eval.py
@@ -14,7 +14,6 @@
 from io import BytesIO
 from typing import Any, Tuple
 
-import blobfile as bf
 import tqdm
 from human_eval.data import HUMAN_EVAL, read_problems
 from human_eval.evaluation import estimate_pass_at_k
diff --git a/math_eval.py b/math_eval.py
index e0948ce..5acf10c 100644
--- a/math_eval.py
+++ b/math_eval.py
@@ -7,7 +7,6 @@
 import random
 import re
 
-import blobfile as bf
 import pandas
 
 from . import common
@@ -92,7 +91,7 @@ def check_equality(sampler: SamplerBase, expr1: str, expr2: str):
 
 class MathEval(Eval):
     def __init__(self, equality_checker: SamplerBase, num_examples: int | None = None):
-        df = pandas.read_csv(bf.BlobFile("https://openaipublic.blob.core.windows.net/simple-evals/math_test.csv"))
+        df = pandas.read_csv("https://openaipublic.blob.core.windows.net/simple-evals/math_test.csv")
         examples = [row.to_dict() for _, row in df.iterrows()]
         if num_examples:
             examples = random.Random(0).sample(examples, num_examples)
diff --git a/mgsm_eval.py b/mgsm_eval.py
index 6309747..27734a8 100644
--- a/mgsm_eval.py
+++ b/mgsm_eval.py
@@ -7,8 +7,7 @@
 
 import re
 from typing import Optional
-
-import blobfile as bf
+import urllib.request
 
 from . import common
 from .mmlu_eval import HTML_JINJA
@@ -109,8 +108,8 @@ def score_mgsm(target: str, prediction: str) -> bool:
 def get_lang_examples(lang: str) -> list[dict[str, str]]:
     fpath = LANG_TO_FPATH[lang]
     examples = []
-    with bf.BlobFile(fpath, "r") as f:
-        for line in f:
+    with urllib.request.urlopen(fpath) as f:
+        for line in f.read().decode("utf-8").splitlines():
             inputs, targets = line.strip().split("\t")
             if "." in targets:
                 raise ValueError(f"targets {targets} contains a decimal point.")
diff --git a/mmlu_eval.py b/mmlu_eval.py
index f64f480..702d56b 100644
--- a/mmlu_eval.py
+++ b/mmlu_eval.py
@@ -7,7 +7,6 @@
 import random
 import re
 
-import blobfile as bf
 import pandas
 
 from . import common
@@ -95,7 +94,7 @@ def format_question(row):
 class MMLUEval(Eval):
     def __init__(self, num_examples: int | None = None):
         df = pandas.read_csv(
-            bf.BlobFile("https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv")
+            "https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv"
         )
         examples = [row.to_dict() for _, row in df.iterrows()]
         if num_examples:

From 4b86af888714a6f4deb63eadaea274748bbe8a28 Mon Sep 17 00:00:00 2001
From: arkadyark-cohere <arkady@cohere.com>
Date: Fri, 12 Apr 2024 16:59:31 +0000
Subject: [PATCH 2/2] Remove breakpoint

---
 drop_eval.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drop_eval.py b/drop_eval.py
index 7f97f5f..a63521c 100644
--- a/drop_eval.py
+++ b/drop_eval.py
@@ -247,7 +247,6 @@ def __init__(self, num_examples: int | None = None, train_samples_per_prompt: in
         self.test_jsonl = (
             "https://openaipublic.blob.core.windows.net/simple-evals/drop_v0_dev.jsonl.gz"
         )
-        breakpoint()
         with gzip.GzipFile(fileobj=urllib.request.urlopen(self.train_jsonl), mode="rb") as f:
             self.train_samples = list(map(json.loads, f.readlines()))
         with gzip.GzipFile(fileobj=urllib.request.urlopen(self.test_jsonl), mode="rb") as f: