From 8fd7727fa8e739a332481ca4271532b6fbf4e8c6 Mon Sep 17 00:00:00 2001
From: "pixeebot[bot]" <104101892+pixeebot[bot]@users.noreply.github.com>
Date: Mon, 6 May 2024 23:03:24 -0400
Subject: [PATCH] Sandbox URL Creation (#8)

Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>
---
 data/datasets/biostars_qa/get_biostars_dataset.py | 4 ++--
 data/datasets/oa_dolly_15k/create_dataset.py | 5 ++---
 data/datasets/oa_stackexchange/download.py | 3 ++-
 data/datasets/youtube_subs_howto100M/prepare.py | 5 ++---
 data/datasets/zhihu-kol/main.py | 8 ++++----
 data/datasets/zhihu-kol/scrape_by_topic.py | 4 ++--
 inference/worker/chat_chain_utils.py | 3 ++-
 inference/worker/openapi_parser.py | 5 +++--
 inference/worker/utils.py | 3 ++-
 model/model_training/custom_datasets/prompt_dialogue.py | 5 ++---
 oasst-shared/pyproject.toml | 3 ++-
 scripts/data_augment/data_augment.py | 3 ++-
 12 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/data/datasets/biostars_qa/get_biostars_dataset.py b/data/datasets/biostars_qa/get_biostars_dataset.py
index 17ae220343..040ed87385 100644
--- a/data/datasets/biostars_qa/get_biostars_dataset.py
+++ b/data/datasets/biostars_qa/get_biostars_dataset.py
@@ -4,8 +4,8 @@
 import time
 
 import pandas as pd
-import requests
 from tqdm import tqdm
+from security import safe_requests
 
 
 def get_biostars_dataset(start_idx=9557161, accept_threshold=1000000, sleep=0.1, folder="biostars"):
@@ -41,7 +41,7 @@ def get_biostars_dataset(start_idx=9557161, accept_threshold=1000000, sleep=0.1,
             print(f"MSG: {file} exists. Skipping; Current accepted: {has_accepted_count}")
             continue
 
-        r = requests.get(url, headers=headers, timeout=60)
+        r = safe_requests.get(url, headers=headers, timeout=60)
 
         # print(r.status_code, r.reason)
diff --git a/data/datasets/oa_dolly_15k/create_dataset.py b/data/datasets/oa_dolly_15k/create_dataset.py
index e1f1f75581..8464296e0b 100644
--- a/data/datasets/oa_dolly_15k/create_dataset.py
+++ b/data/datasets/oa_dolly_15k/create_dataset.py
@@ -1,15 +1,14 @@
 import json
 from pathlib import Path
-
-import requests
 from datasets import Dataset
+from security import safe_requests
 
 DATA_URL = "https://raw.githubusercontent.com/databrickslabs/dolly/master/data/databricks-dolly-15k.jsonl"
 FILE_PATH = "databricks_dolly_15k.jsonl"
 
 
 def download_data(url: str, destination: str):
-    response = requests.get(url, stream=True, timeout=60)
+    response = safe_requests.get(url, stream=True, timeout=60)
 
     with open(destination, "wb") as handle:
         for data in response.iter_content():
diff --git a/data/datasets/oa_stackexchange/download.py b/data/datasets/oa_stackexchange/download.py
index b90c2579f2..aef005ca2d 100755
--- a/data/datasets/oa_stackexchange/download.py
+++ b/data/datasets/oa_stackexchange/download.py
@@ -14,6 +14,7 @@
 import requests
 from bs4 import BeautifulSoup as bs
+from security import safe_requests
 
 BASE_URL = "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/{0}&file=Posts.xml"
 DOWNLOAD_DIR = "xml/"
@@ -50,7 +51,7 @@ def download_url(dataset_name: str, url: str):
         return cache_path
     else:
         print("Downloading xml: ", dataset_name)
-        response = requests.get(url, timeout=60)
+        response = safe_requests.get(url, timeout=60)
         print("Finished downloading: ", dataset_name)
         with open(cache_path, "wb") as f:
             f.write(response.content)
diff --git a/data/datasets/youtube_subs_howto100M/prepare.py b/data/datasets/youtube_subs_howto100M/prepare.py
index 6899841b7b..05bf1bb0a7 100644
--- a/data/datasets/youtube_subs_howto100M/prepare.py
+++ b/data/datasets/youtube_subs_howto100M/prepare.py
@@ -8,11 +8,10 @@
 import urllib
 import zipfile
 from typing import List
-
-import requests
 from tqdm import tqdm
 from youtube_transcript_api import YouTubeTranscriptApi
 import secrets
+from security import safe_requests
 
 
 def get_video_ids(raw_file: str, video_id_pattern: str) -> List[str]:
@@ -74,7 +73,7 @@ def main(output_dir: str = "data"):
     print("Downloading HowTo100M raw_caption.zip...")
     print(" might take some time(3.4G)...")
     url = "https://www.rocq.inria.fr/cluster-willow/amiech/howto100m/raw_caption.zip"
-    response = requests.get(url, timeout=60)
+    response = safe_requests.get(url, timeout=60)
     zipped = zipfile.ZipFile(io.BytesIO(response.content))
     zipped.extractall("./temp")
diff --git a/data/datasets/zhihu-kol/main.py b/data/datasets/zhihu-kol/main.py
index d554320124..1f2c0497c4 100644
--- a/data/datasets/zhihu-kol/main.py
+++ b/data/datasets/zhihu-kol/main.py
@@ -4,10 +4,10 @@
 import multitasking
 import pandas as pd
-import requests
 from bs4 import BeautifulSoup
 from retry import retry
 from tqdm import tqdm
+from security import safe_requests
 
 
 def get_uid_by_url_token(url_token: str) -> str:
@@ -41,7 +41,7 @@ def get_uid_by_url_token(url_token: str) -> str:
     }
     url = "https://api.zhihu.com/people/" + url_token
 
-    response = requests.get(url, headers=headers, timeout=60)
+    response = safe_requests.get(url, headers=headers, timeout=60)
     uid = response.json()["id"]
     return uid
@@ -100,7 +100,7 @@ def get_user_answers(url_token: str, max_count: int = 100000) -> pd.DataFrame:
             ("offset", f"{offset}"),
         )
 
-        response = requests.get(url, headers=headers, params=params, timeout=60)
+        response = safe_requests.get(url, headers=headers, params=params, timeout=60)
 
         if response.json().get("paging") is None:
             return pd.DataFrame(columns=operations.keys())
@@ -148,7 +148,7 @@ def get_answer_content(qid: str, aid) -> str:
         "Host": "www.zhihu.com",
     }
     url = f"https://www.zhihu.com/question/{qid}/answer/{aid}"
-    response = requests.get(url, headers=headers, timeout=60)
+    response = safe_requests.get(url, headers=headers, timeout=60)
     soup = BeautifulSoup(response.text, "html.parser")
     content = " ".join([p.text.strip() for p in soup.find_all("p")])
diff --git a/data/datasets/zhihu-kol/scrape_by_topic.py b/data/datasets/zhihu-kol/scrape_by_topic.py
index dd8dc685d4..97aad2f8b7 100644
--- a/data/datasets/zhihu-kol/scrape_by_topic.py
+++ b/data/datasets/zhihu-kol/scrape_by_topic.py
@@ -8,11 +8,11 @@
 import numpy as np
 import pandas as pd
-import requests
 from bs4 import BeautifulSoup
 from loguru import logger
 from playwright.sync_api import Locator, Page, sync_playwright
 from tqdm import tqdm
+from security import safe_requests
 
 
 @dataclass
@@ -46,7 +46,7 @@ def get_answer_content(qid: int, aid: int, question_str: str) -> str:
         "Host": "www.zhihu.com",
     }
     url = f"https://www.zhihu.com/question/{qid}/answer/{aid}"
-    response = requests.get(url, headers=headers, timeout=60)
+    response = safe_requests.get(url, headers=headers, timeout=60)
     soup = BeautifulSoup(response.text, "html.parser")
     content = " ".join([p.text.strip() for p in soup.find_all("p")])
diff --git a/inference/worker/chat_chain_utils.py b/inference/worker/chat_chain_utils.py
index 602c925333..0185ac7781 100644
--- a/inference/worker/chat_chain_utils.py
+++ b/inference/worker/chat_chain_utils.py
@@ -14,6 +14,7 @@
 from openapi_parser import prepare_plugin_for_llm
 from settings import settings
 from utils import shared_tokenizer_lock, special_tokens
+from security import safe_requests
 
 RESPONSE_MAX_LENGTH = 2048
 DESCRIPTION_FOR_MODEL_MAX_LENGTH = 512
@@ -205,7 +206,7 @@ def run_request(self, params: str, url: str, param_location: str, type: str, pay
         logger.info(
             f"Running {type.upper()} request on {url} with\nparams: {params}\nparam_location: {param_location}\npayload: {payload}"
         )
-        res = requests.get(url, params=query_params, headers=headers, timeout=60)
+        res = safe_requests.get(url, params=query_params, headers=headers, timeout=60)
     elif type.lower() == "post":
         # if model did not generate payload object, use params as payload
         data = json.dumps(payload) if payload else json.dumps(params)
diff --git a/inference/worker/openapi_parser.py b/inference/worker/openapi_parser.py
index 26a9f51eb4..136e75b343 100644
--- a/inference/worker/openapi_parser.py
+++ b/inference/worker/openapi_parser.py
@@ -5,10 +5,11 @@
 import yaml
 from loguru import logger
 from oasst_shared.schemas import inference
+from security import safe_requests
 
 
 def fetch_openapi_spec(url):
-    response = requests.get(url, timeout=60)
+    response = safe_requests.get(url, timeout=60)
 
     if response.status_code != 200:
         raise Exception(f"Failed to fetch data from URL: {url}. Status code: {response.status_code}")
@@ -29,7 +30,7 @@ def get_plugin_config(url: str) -> inference.PluginConfig | None:
     try:
-        response = requests.get(url, timeout=60)
+        response = safe_requests.get(url, timeout=60)
         response.raise_for_status()
         plugin_dict = response.json()
         logger.info(f"Plugin config downloaded {plugin_dict}")
diff --git a/inference/worker/utils.py b/inference/worker/utils.py
index 2add1a6d94..4c9a61559f 100644
--- a/inference/worker/utils.py
+++ b/inference/worker/utils.py
@@ -14,6 +14,7 @@
 from oasst_shared.schemas import inference
 from settings import settings
 import secrets
+from security import safe_requests
 
 shared_tokenizer_lock = threading.Lock()
@@ -258,7 +259,7 @@ def _maybe_add_bearer_token(self, headers: dict[str, str] | None):
 
     def get(self, path: str, **kwargs):
         kwargs["headers"] = self._maybe_add_bearer_token(kwargs.get("headers"))
-        return requests.get(self.base_url + path, auth=self.auth, **kwargs, timeout=60)
+        return safe_requests.get(self.base_url + path, auth=self.auth, **kwargs, timeout=60)
 
     def post(self, path: str, **kwargs):
         kwargs["headers"] = self._maybe_add_bearer_token(kwargs.get("headers"))
diff --git a/model/model_training/custom_datasets/prompt_dialogue.py b/model/model_training/custom_datasets/prompt_dialogue.py
index b3f41f8376..39816fcd25 100644
--- a/model/model_training/custom_datasets/prompt_dialogue.py
+++ b/model/model_training/custom_datasets/prompt_dialogue.py
@@ -3,14 +3,13 @@
 import re
 from pathlib import Path
 from typing import List, Mapping, Optional, Sequence, Union
-
-import requests
 from datasets import load_dataset
 from model_training.custom_datasets.formatting import DatasetEntrySft, Role, Utterance
 from model_training.custom_datasets.oasst_dataset import ListDataset
 from model_training.custom_datasets.utils import _filter_by_words
 from torch import Generator, randperm
 from torch.utils.data import Dataset, random_split
+from security import safe_requests
 
 
 def load_oig_file(
@@ -34,7 +33,7 @@ def load_oig_file(
     # download file if not cached
     if not local_path.exists() or local_path.stat().st_size == 0 or no_cache:
         print(f"downloading {source_url} to {local_path}")
-        r = requests.get(source_url, stream=True, timeout=60)
+        r = safe_requests.get(source_url, stream=True, timeout=60)
         with local_path.open(mode="wb") as fd:
             for chunk in r.iter_content(chunk_size=1024 * 1024):
                 fd.write(chunk)
diff --git a/oasst-shared/pyproject.toml b/oasst-shared/pyproject.toml
index baee4f9e18..b93695ce5e 100644
--- a/oasst-shared/pyproject.toml
+++ b/oasst-shared/pyproject.toml
@@ -11,7 +11,8 @@ dependencies = [
     "aiohttp[speedups]",
     "loguru==0.6.0",
     "psutil==5.9.4",
-    "pynvml==11.5.0"
+    "pynvml==11.5.0",
+    "security==1.2.1"
 ]
 
 [project.optional-dependencies]
diff --git a/scripts/data_augment/data_augment.py b/scripts/data_augment/data_augment.py
index 6ba8213153..04e0992ba6 100644
--- a/scripts/data_augment/data_augment.py
+++ b/scripts/data_augment/data_augment.py
@@ -26,6 +26,7 @@
 from syntax.syntax_injector import SyntaxBug
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5ForConditionalGeneration, pipeline
 import secrets
+from security import safe_requests
 
 
 class DataAugmenter:
@@ -245,7 +246,7 @@ def parse(self, _):
 
         xml_posts_path = urls.get(dataset_name)
 
-        response = requests.get(xml_posts_path, timeout=60)
+        response = safe_requests.get(xml_posts_path, timeout=60)
         df = self.xml_to_df(response)
         df = self.filter(df)
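Every hunk above makes the same drop-in substitution: requests.get(...) becomes safe_requests.get(...) with the arguments unchanged, plus the "from security import safe_requests" import and the security==1.2.1 dependency pin. Below is a minimal sketch of that pattern for reviewers; the helper name and the example URL are illustrative only, and it assumes safe_requests.get mirrors the requests.get call signature and response object (timeout=, raise_for_status(), .text), as the hunks above rely on.

# Illustrative only -- not part of the patch.
# Assumes the security package pinned above (security==1.2.1) exposes
# safe_requests with a requests-compatible get(), which vets the URL
# before connecting (per the "Sandbox URL Creation" subject).
from security import safe_requests


def fetch_text(url: str) -> str:
    # Same call shape as requests.get(); only the module name changes.
    response = safe_requests.get(url, timeout=60)
    response.raise_for_status()
    return response.text


if __name__ == "__main__":
    # Hypothetical usage; example.com is a placeholder, not a project URL.
    print(fetch_text("https://example.com")[:200])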