Sandbox URL Creation (#8)
Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>
pixeebot[bot] authored May 7, 2024
1 parent 0d7e3ee commit 8fd7727
Showing 12 changed files with 27 additions and 24 deletions.
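
Every change below follows the same pattern: a direct requests.get call is swapped for safe_requests.get from the security package (pinned as "security==1.2.1" in oasst-shared/pyproject.toml), which appears to act as a drop-in wrapper that vets the target URL before the request is made, e.g. to limit SSRF-style abuse of attacker-influenced URLs. A minimal sketch of the before/after, reusing the call signature seen in the diffs (url and headers stand in for whatever each call site already passes):

    # before: unrestricted outbound HTTP request
    import requests
    r = requests.get(url, headers=headers, timeout=60)

    # after: hardened drop-in replacement from the `security` package
    from security import safe_requests
    r = safe_requests.get(url, headers=headers, timeout=60)

Because the wrapper keeps the requests.get keyword arguments (headers, params, stream, timeout all pass through), each call site changes only the function name.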
4 changes: 2 additions & 2 deletions data/datasets/biostars_qa/get_biostars_dataset.py
@@ -4,8 +4,8 @@
import time

import pandas as pd
-import requests
from tqdm import tqdm
+from security import safe_requests


def get_biostars_dataset(start_idx=9557161, accept_threshold=1000000, sleep=0.1, folder="biostars"):
@@ -41,7 +41,7 @@ def get_biostars_dataset(start_idx=9557161, accept_threshold=1000000, sleep=0.1,
print(f"MSG: {file} exists. Skipping; Current accepted: {has_accepted_count}")
continue

-r = requests.get(url, headers=headers, timeout=60)
+r = safe_requests.get(url, headers=headers, timeout=60)

# print(r.status_code, r.reason)

5 changes: 2 additions & 3 deletions data/datasets/oa_dolly_15k/create_dataset.py
@@ -1,15 +1,14 @@
import json
from pathlib import Path

-import requests
from datasets import Dataset
+from security import safe_requests

DATA_URL = "https://raw.githubusercontent.com/databrickslabs/dolly/master/data/databricks-dolly-15k.jsonl"
FILE_PATH = "databricks_dolly_15k.jsonl"


def download_data(url: str, destination: str):
-response = requests.get(url, stream=True, timeout=60)
+response = safe_requests.get(url, stream=True, timeout=60)

with open(destination, "wb") as handle:
for data in response.iter_content():
3 changes: 2 additions & 1 deletion data/datasets/oa_stackexchange/download.py
@@ -14,6 +14,7 @@

import requests
from bs4 import BeautifulSoup as bs
+from security import safe_requests

BASE_URL = "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/{0}&file=Posts.xml"
DOWNLOAD_DIR = "xml/"
@@ -50,7 +51,7 @@ def download_url(dataset_name: str, url: str):
return cache_path
else:
print("Downloading xml: ", dataset_name)
-response = requests.get(url, timeout=60)
+response = safe_requests.get(url, timeout=60)
print("Finished downloading: ", dataset_name)
with open(cache_path, "wb") as f:
f.write(response.content)
5 changes: 2 additions & 3 deletions data/datasets/youtube_subs_howto100M/prepare.py
@@ -8,11 +8,10 @@
import urllib
import zipfile
from typing import List

-import requests
from tqdm import tqdm
from youtube_transcript_api import YouTubeTranscriptApi
import secrets
+from security import safe_requests


def get_video_ids(raw_file: str, video_id_pattern: str) -> List[str]:
@@ -74,7 +73,7 @@ def main(output_dir: str = "data"):
print("Downloading HowTo100M raw_caption.zip...")
print(" might take some time(3.4G)...")
url = "https://www.rocq.inria.fr/cluster-willow/amiech/howto100m/raw_caption.zip"
-response = requests.get(url, timeout=60)
+response = safe_requests.get(url, timeout=60)
zipped = zipfile.ZipFile(io.BytesIO(response.content))
zipped.extractall("./temp")

8 changes: 4 additions & 4 deletions data/datasets/zhihu-kol/main.py
@@ -4,10 +4,10 @@

import multitasking
import pandas as pd
-import requests
from bs4 import BeautifulSoup
from retry import retry
from tqdm import tqdm
+from security import safe_requests


def get_uid_by_url_token(url_token: str) -> str:
@@ -41,7 +41,7 @@ def get_uid_by_url_token(url_token: str) -> str:
}

url = "https://api.zhihu.com/people/" + url_token
-response = requests.get(url, headers=headers, timeout=60)
+response = safe_requests.get(url, headers=headers, timeout=60)
uid = response.json()["id"]
return uid

@@ -100,7 +100,7 @@ def get_user_answers(url_token: str, max_count: int = 100000) -> pd.DataFrame:
("offset", f"{offset}"),
)

-response = requests.get(url, headers=headers, params=params, timeout=60)
+response = safe_requests.get(url, headers=headers, params=params, timeout=60)

if response.json().get("paging") is None:
return pd.DataFrame(columns=operations.keys())
@@ -148,7 +148,7 @@ def get_answer_content(qid: str, aid) -> str:
"Host": "www.zhihu.com",
}
url = f"https://www.zhihu.com/question/{qid}/answer/{aid}"
-response = requests.get(url, headers=headers, timeout=60)
+response = safe_requests.get(url, headers=headers, timeout=60)

soup = BeautifulSoup(response.text, "html.parser")
content = " ".join([p.text.strip() for p in soup.find_all("p")])
4 changes: 2 additions & 2 deletions data/datasets/zhihu-kol/scrape_by_topic.py
@@ -8,11 +8,11 @@

import numpy as np
import pandas as pd
-import requests
from bs4 import BeautifulSoup
from loguru import logger
from playwright.sync_api import Locator, Page, sync_playwright
from tqdm import tqdm
+from security import safe_requests


@dataclass
@@ -46,7 +46,7 @@ def get_answer_content(qid: int, aid: int, question_str: str) -> str:
"Host": "www.zhihu.com",
}
url = f"https://www.zhihu.com/question/{qid}/answer/{aid}"
-response = requests.get(url, headers=headers, timeout=60)
+response = safe_requests.get(url, headers=headers, timeout=60)

soup = BeautifulSoup(response.text, "html.parser")
content = " ".join([p.text.strip() for p in soup.find_all("p")])
3 changes: 2 additions & 1 deletion inference/worker/chat_chain_utils.py
@@ -14,6 +14,7 @@
from openapi_parser import prepare_plugin_for_llm
from settings import settings
from utils import shared_tokenizer_lock, special_tokens
+from security import safe_requests

RESPONSE_MAX_LENGTH = 2048
DESCRIPTION_FOR_MODEL_MAX_LENGTH = 512
@@ -205,7 +206,7 @@ def run_request(self, params: str, url: str, param_location: str, type: str, pay
logger.info(
f"Running {type.upper()} request on {url} with\nparams: {params}\nparam_location: {param_location}\npayload: {payload}"
)
-res = requests.get(url, params=query_params, headers=headers, timeout=60)
+res = safe_requests.get(url, params=query_params, headers=headers, timeout=60)
elif type.lower() == "post":
# if model did not generate payload object, use params as payload
data = json.dumps(payload) if payload else json.dumps(params)
5 changes: 3 additions & 2 deletions inference/worker/openapi_parser.py
@@ -5,10 +5,11 @@
import yaml
from loguru import logger
from oasst_shared.schemas import inference
+from security import safe_requests


def fetch_openapi_spec(url):
-response = requests.get(url, timeout=60)
+response = safe_requests.get(url, timeout=60)
if response.status_code != 200:
raise Exception(f"Failed to fetch data from URL: {url}. Status code: {response.status_code}")

@@ -29,7 +30,7 @@ def fetch_openapi_spec(url):

def get_plugin_config(url: str) -> inference.PluginConfig | None:
try:
-response = requests.get(url, timeout=60)
+response = safe_requests.get(url, timeout=60)
response.raise_for_status()
plugin_dict = response.json()
logger.info(f"Plugin config downloaded {plugin_dict}")
3 changes: 2 additions & 1 deletion inference/worker/utils.py
@@ -14,6 +14,7 @@
from oasst_shared.schemas import inference
from settings import settings
import secrets
+from security import safe_requests

shared_tokenizer_lock = threading.Lock()

@@ -258,7 +259,7 @@ def _maybe_add_bearer_token(self, headers: dict[str, str] | None):

def get(self, path: str, **kwargs):
kwargs["headers"] = self._maybe_add_bearer_token(kwargs.get("headers"))
-return requests.get(self.base_url + path, auth=self.auth, **kwargs, timeout=60)
+return safe_requests.get(self.base_url + path, auth=self.auth, **kwargs, timeout=60)

def post(self, path: str, **kwargs):
kwargs["headers"] = self._maybe_add_bearer_token(kwargs.get("headers"))
5 changes: 2 additions & 3 deletions model/model_training/custom_datasets/prompt_dialogue.py
@@ -3,14 +3,13 @@
import re
from pathlib import Path
from typing import List, Mapping, Optional, Sequence, Union

-import requests
from datasets import load_dataset
from model_training.custom_datasets.formatting import DatasetEntrySft, Role, Utterance
from model_training.custom_datasets.oasst_dataset import ListDataset
from model_training.custom_datasets.utils import _filter_by_words
from torch import Generator, randperm
from torch.utils.data import Dataset, random_split
+from security import safe_requests


def load_oig_file(
@@ -34,7 +33,7 @@ def load_oig_file(
# download file if not cached
if not local_path.exists() or local_path.stat().st_size == 0 or no_cache:
print(f"downloading {source_url} to {local_path}")
-r = requests.get(source_url, stream=True, timeout=60)
+r = safe_requests.get(source_url, stream=True, timeout=60)
with local_path.open(mode="wb") as fd:
for chunk in r.iter_content(chunk_size=1024 * 1024):
fd.write(chunk)
3 changes: 2 additions & 1 deletion oasst-shared/pyproject.toml
@@ -11,7 +11,8 @@ dependencies = [
"aiohttp[speedups]",
"loguru==0.6.0",
"psutil==5.9.4",
"pynvml==11.5.0"
"pynvml==11.5.0",
"security==1.2.1"
]

[project.optional-dependencies]
3 changes: 2 additions & 1 deletion scripts/data_augment/data_augment.py
@@ -26,6 +26,7 @@
from syntax.syntax_injector import SyntaxBug
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5ForConditionalGeneration, pipeline
import secrets
+from security import safe_requests


class DataAugmenter:
@@ -245,7 +246,7 @@ def parse(self, _):

xml_posts_path = urls.get(dataset_name)

-response = requests.get(xml_posts_path, timeout=60)
+response = safe_requests.get(xml_posts_path, timeout=60)
df = self.xml_to_df(response)
df = self.filter(df)

