Sandbox URL Creation (#8)
Co-authored-by: pixeebot[bot] <104101892+pixeebot[bot]@users.noreply.github.com>
pixeebot[bot] authored May 7, 2024
1 parent 0d7e3ee commit 8fd7727
Showing 12 changed files with 27 additions and 24 deletions.
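
Every change below follows the same pattern: a direct requests.get call is swapped for safe_requests.get from the security package (pinned as "security==1.2.1" in oasst-shared/pyproject.toml), which appears to act as a drop-in wrapper that vets the target URL before the request is made, e.g. to limit SSRF-style abuse of attacker-influenced URLs. A minimal sketch of the before/after, reusing the call signature seen in the diffs (url and headers stand in for whatever each call site already passes):

    # before: unrestricted outbound HTTP request
    import requests
    r = requests.get(url, headers=headers, timeout=60)

    # after: hardened drop-in replacement from the `security` package
    from security import safe_requests
    r = safe_requests.get(url, headers=headers, timeout=60)

Because the wrapper keeps the requests.get keyword arguments (headers, params, stream, timeout all pass through), each call site changes only the function name.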
4 changes: 2 additions & 2 deletions data/datasets/biostars_qa/get_biostars_dataset.py
@@ -4,8 +4,8 @@
import time

import pandas as pd
-import requests
from tqdm import tqdm
+from security import safe_requests


def get_biostars_dataset(start_idx=9557161, accept_threshold=1000000, sleep=0.1, folder="biostars"):
@@ -41,7 +41,7 @@ def get_biostars_dataset(start_idx=9557161, accept_threshold=1000000, sleep=0.1,
print(f"MSG: {file} exists. Skipping; Current accepted: {has_accepted_count}")
continue

-r = requests.get(url, headers=headers, timeout=60)
+r = safe_requests.get(url, headers=headers, timeout=60)

# print(r.status_code, r.reason)

5 changes: 2 additions & 3 deletions data/datasets/oa_dolly_15k/create_dataset.py
@@ -1,15 +1,14 @@
import json
from pathlib import Path

-import requests
from datasets import Dataset
+from security import safe_requests

DATA_URL = "https://raw.githubusercontent.com/databrickslabs/dolly/master/data/databricks-dolly-15k.jsonl"
FILE_PATH = "databricks_dolly_15k.jsonl"


def download_data(url: str, destination: str):
-response = requests.get(url, stream=True, timeout=60)
+response = safe_requests.get(url, stream=True, timeout=60)

with open(destination, "wb") as handle:
for data in response.iter_content():
3 changes: 2 additions & 1 deletion data/datasets/oa_stackexchange/download.py
@@ -14,6 +14,7 @@

import requests
from bs4 import BeautifulSoup as bs
+from security import safe_requests

BASE_URL = "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/{0}&file=Posts.xml"
DOWNLOAD_DIR = "xml/"
@@ -50,7 +51,7 @@ def download_url(dataset_name: str, url: str):
return cache_path
else:
print("Downloading xml: ", dataset_name)
-response = requests.get(url, timeout=60)
+response = safe_requests.get(url, timeout=60)
print("Finished downloading: ", dataset_name)
with open(cache_path, "wb") as f:
f.write(response.content)
5 changes: 2 additions & 3 deletions data/datasets/youtube_subs_howto100M/prepare.py
@@ -8,11 +8,10 @@
import urllib
import zipfile
from typing import List

-import requests
from tqdm import tqdm
from youtube_transcript_api import YouTubeTranscriptApi
import secrets
+from security import safe_requests


def get_video_ids(raw_file: str, video_id_pattern: str) -> List[str]:
@@ -74,7 +73,7 @@ def main(output_dir: str = "data"):
print("Downloading HowTo100M raw_caption.zip...")
print(" might take some time(3.4G)...")
url = "https://www.rocq.inria.fr/cluster-willow/amiech/howto100m/raw_caption.zip"
-response = requests.get(url, timeout=60)
+response = safe_requests.get(url, timeout=60)
zipped = zipfile.ZipFile(io.BytesIO(response.content))
zipped.extractall("./temp")

8 changes: 4 additions & 4 deletions data/datasets/zhihu-kol/main.py
@@ -4,10 +4,10 @@

import multitasking
import pandas as pd
-import requests
from bs4 import BeautifulSoup
from retry import retry
from tqdm import tqdm
+from security import safe_requests


def get_uid_by_url_token(url_token: str) -> str:
@@ -41,7 +41,7 @@ def get_uid_by_url_token(url_token: str) -> str:
}

url = "https://api.zhihu.com/people/" + url_token
-response = requests.get(url, headers=headers, timeout=60)
+response = safe_requests.get(url, headers=headers, timeout=60)
uid = response.json()["id"]
return uid

@@ -100,7 +100,7 @@ def get_user_answers(url_token: str, max_count: int = 100000) -> pd.DataFrame:
("offset", f"{offset}"),
)

-response = requests.get(url, headers=headers, params=params, timeout=60)
+response = safe_requests.get(url, headers=headers, params=params, timeout=60)

if response.json().get("paging") is None:
return pd.DataFrame(columns=operations.keys())
@@ -148,7 +148,7 @@ def get_answer_content(qid: str, aid) -> str:
"Host": "www.zhihu.com",
}
url = f"https://www.zhihu.com/question/{qid}/answer/{aid}"
-response = requests.get(url, headers=headers, timeout=60)
+response = safe_requests.get(url, headers=headers, timeout=60)

soup = BeautifulSoup(response.text, "html.parser")
content = " ".join([p.text.strip() for p in soup.find_all("p")])
4 changes: 2 additions & 2 deletions data/datasets/zhihu-kol/scrape_by_topic.py
@@ -8,11 +8,11 @@

import numpy as np
import pandas as pd
-import requests
from bs4 import BeautifulSoup
from loguru import logger
from playwright.sync_api import Locator, Page, sync_playwright
from tqdm import tqdm
+from security import safe_requests


@dataclass
@@ -46,7 +46,7 @@ def get_answer_content(qid: int, aid: int, question_str: str) -> str:
"Host": "www.zhihu.com",
}
url = f"https://www.zhihu.com/question/{qid}/answer/{aid}"
-response = requests.get(url, headers=headers, timeout=60)
+response = safe_requests.get(url, headers=headers, timeout=60)

soup = BeautifulSoup(response.text, "html.parser")
content = " ".join([p.text.strip() for p in soup.find_all("p")])
3 changes: 2 additions & 1 deletion inference/worker/chat_chain_utils.py
@@ -14,6 +14,7 @@
from openapi_parser import prepare_plugin_for_llm
from settings import settings
from utils import shared_tokenizer_lock, special_tokens
+from security import safe_requests

RESPONSE_MAX_LENGTH = 2048
DESCRIPTION_FOR_MODEL_MAX_LENGTH = 512
@@ -205,7 +206,7 @@ def run_request(self, params: str, url: str, param_location: str, type: str, pay
logger.info(
f"Running {type.upper()} request on {url} with\nparams: {params}\nparam_location: {param_location}\npayload: {payload}"
)
-res = requests.get(url, params=query_params, headers=headers, timeout=60)
+res = safe_requests.get(url, params=query_params, headers=headers, timeout=60)
elif type.lower() == "post":
# if model did not generate payload object, use params as payload
data = json.dumps(payload) if payload else json.dumps(params)
5 changes: 3 additions & 2 deletions inference/worker/openapi_parser.py
@@ -5,10 +5,11 @@
import yaml
from loguru import logger
from oasst_shared.schemas import inference
+from security import safe_requests


def fetch_openapi_spec(url):
-response = requests.get(url, timeout=60)
+response = safe_requests.get(url, timeout=60)
if response.status_code != 200:
raise Exception(f"Failed to fetch data from URL: {url}. Status code: {response.status_code}")

@@ -29,7 +30,7 @@ def fetch_openapi_spec(url):

def get_plugin_config(url: str) -> inference.PluginConfig | None:
try:
-response = requests.get(url, timeout=60)
+response = safe_requests.get(url, timeout=60)
response.raise_for_status()
plugin_dict = response.json()
logger.info(f"Plugin config downloaded {plugin_dict}")
3 changes: 2 additions & 1 deletion inference/worker/utils.py
@@ -14,6 +14,7 @@
from oasst_shared.schemas import inference
from settings import settings
import secrets
+from security import safe_requests

shared_tokenizer_lock = threading.Lock()

@@ -258,7 +259,7 @@ def _maybe_add_bearer_token(self, headers: dict[str, str] | None):

def get(self, path: str, **kwargs):
kwargs["headers"] = self._maybe_add_bearer_token(kwargs.get("headers"))
-return requests.get(self.base_url + path, auth=self.auth, **kwargs, timeout=60)
+return safe_requests.get(self.base_url + path, auth=self.auth, **kwargs, timeout=60)

def post(self, path: str, **kwargs):
kwargs["headers"] = self._maybe_add_bearer_token(kwargs.get("headers"))
5 changes: 2 additions & 3 deletions model/model_training/custom_datasets/prompt_dialogue.py
@@ -3,14 +3,13 @@
import re
from pathlib import Path
from typing import List, Mapping, Optional, Sequence, Union

-import requests
from datasets import load_dataset
from model_training.custom_datasets.formatting import DatasetEntrySft, Role, Utterance
from model_training.custom_datasets.oasst_dataset import ListDataset
from model_training.custom_datasets.utils import _filter_by_words
from torch import Generator, randperm
from torch.utils.data import Dataset, random_split
+from security import safe_requests


def load_oig_file(
@@ -34,7 +33,7 @@ def load_oig_file(
# download file if not cached
if not local_path.exists() or local_path.stat().st_size == 0 or no_cache:
print(f"downloading {source_url} to {local_path}")
-r = requests.get(source_url, stream=True, timeout=60)
+r = safe_requests.get(source_url, stream=True, timeout=60)
with local_path.open(mode="wb") as fd:
for chunk in r.iter_content(chunk_size=1024 * 1024):
fd.write(chunk)
3 changes: 2 additions & 1 deletion oasst-shared/pyproject.toml
@@ -11,7 +11,8 @@ dependencies = [
"aiohttp[speedups]",
"loguru==0.6.0",
"psutil==5.9.4",
"pynvml==11.5.0"
"pynvml==11.5.0",
"security==1.2.1"
]

[project.optional-dependencies]
3 changes: 2 additions & 1 deletion scripts/data_augment/data_augment.py
@@ -26,6 +26,7 @@
from syntax.syntax_injector import SyntaxBug
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, T5ForConditionalGeneration, pipeline
import secrets
+from security import safe_requests


class DataAugmenter:
@@ -245,7 +246,7 @@ def parse(self, _):

xml_posts_path = urls.get(dataset_name)

-response = requests.get(xml_posts_path, timeout=60)
+response = safe_requests.get(xml_posts_path, timeout=60)
df = self.xml_to_df(response)
df = self.filter(df)

