import grequests
from lxml import html
import time
import json
# # read the top k pages from file "top_k_ids.txt"
# with open("top_k_ids.txt", "r") as f:
#     pages = f.readlines()
# # generate the viewer url for each page (100 rows per viewer page)
# urls = dict()
# for page in pages:
#     actual_page = int(page) // 100
#     row = int(page) % 100 + 1
#     url = "https://huggingface.co/datasets/Cohere/wikipedia-22-12-simple-embeddings/viewer/default/train?p=" + str(actual_page)
#     if url not in urls:
#         urls[url] = list()
#     urls[url].append(row)
# start = time.time()
# # send the requests concurrently
# rs = (grequests.get(u) for u in urls)
# responses = grequests.map(rs)
# top_k_data = list()
# # parse each response and collect the text of the requested rows
# for response in responses:
#     content = response.text
#     parsed = html.fromstring(content)
#     # get the table rows of the viewer page, then pick the rows listed in urls[response.url]
#     rows = parsed.xpath("/html/body/div[1]/main/div/div/div/div[2]/div/table/tbody/tr")
#     for row in urls[response.url]:
#         text = rows[row - 1].xpath("td[3]/div/div/text()")[0]
#         top_k_data.append(text)
# end = time.time()
# print("Time taken to crawl the top K: ", end - start)  # time taken to run the code
# # write the collected texts to file
# with open("top_k_data.txt", "w") as f:
#     for data in top_k_data:
#         f.write(data + "\n")
# import requests
# # query the datasets-server rows API for a single row of wiki_dpr
# headers = {"Authorization": f"Bearer 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"}
# API_URL = "https://datasets-server.huggingface.co/rows?dataset=wiki_dpr&config=psgs_w100.multiset.exact&split=train&offset=9000000&length=1"
# def query():
#     response = requests.get(API_URL, headers=headers)
#     return response.json()
# data = query()
# # with open("nashaat.txt", "w") as f:
# #     json.dump(data, f)
# print(json.dumps(data, indent=4))
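# # A hedged sketch (not part of the original script): fetch the rows listed in
# # top_k_ids.txt through the same datasets-server /rows endpoint instead of scraping
# # the viewer HTML above. It reuses the headers defined above; the "text" column
# # name is an assumption about the wiki_dpr schema.
# with open("top_k_ids.txt", "r") as f:
#     top_ids = [int(line) for line in f if line.strip()]
# top_texts = list()
# for row_id in top_ids:
#     url = ("https://datasets-server.huggingface.co/rows?dataset=wiki_dpr"
#            "&config=psgs_w100.multiset.exact&split=train"
#            f"&offset={row_id}&length=1")
#     payload = requests.get(url, headers=headers).json()
#     top_texts.append(payload["rows"][0]["row"]["text"])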
import pandas as pd
from datasets import load_dataset
from huggingface_hub import HfFileSystem
token = "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"
fs = HfFileSystem(token=token)
print("authenticated")
fs.ls("datasets/wiki_dpr/blob")
# start = time.time()
# # load one parquet shard of the dataset (use the /resolve/ URL so the raw file is fetched, not the HTML blob page)
# base_url = "https://huggingface.co/datasets/wiki_dpr/resolve/main/data/psgs_w100/nq/train-00000-of-00157.parquet"
# data_files = {"train": base_url}
# dataset = load_dataset("parquet", data_files=data_files, split="train")
# end = time.time()
# print("Time taken to read the file: ", end - start)  # time taken to run the code
# # print the row at index 10
# print(dataset[10])
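# # A hedged sketch: stream the same parquet file instead of downloading it fully,
# # then pull the row at index 10; assumes the resolve URL above is publicly reachable.
# streamed = load_dataset("parquet", data_files=data_files, split="train", streaming=True)
# for i, example in enumerate(streamed):
#     if i == 10:
#         print(example)
#         break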