-
Notifications
You must be signed in to change notification settings - Fork 1
/
generate_datasets.py
50 lines (44 loc) · 1.63 KB
/
generate_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import json
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
from computation.doc2vec import doc2vec_integration
from textbooks.data import Textbook
from textbooks.utils import extract_content
def integrate_textbook(base_textbook_path, all_textbook_paths):
    """Run a doc2vec integration using one textbook as the base.

    Every other path in *all_textbook_paths* is loaded and used as an
    "other" textbook; the base itself is excluded from that pool.

    Returns a ``(stem, dataset)`` pair so the caller can key results by
    the base textbook's filename stem.
    """
    print(f"Integrating with base {base_textbook_path.stem}...")
    # All textbooks except the base participate on the "other" side.
    other_textbooks = tuple(
        Textbook.from_json(path)
        for path in all_textbook_paths
        if path != base_textbook_path
    )
    integration = doc2vec_integration(
        base_textbook=Textbook.from_json(base_textbook_path),
        other_textbooks=other_textbooks,
        text_extraction_fn=extract_content,
        threshold=0.4,
        vector_size=100,
        min_count=1,
        epochs=40,
        iterative=False,
        evaluate=False,
    )
    print(f"Finished integrating with base {base_textbook_path.stem}.")
    return base_textbook_path.stem, integration.dataset
if __name__ == "__main__":
    # NOTE(review): glob("*") matches every entry in the directory, not
    # just JSON files — confirm textbooks-parsed/ contains only parsed
    # textbook files.
    all_textbook_paths = list(Path("textbooks-parsed").glob("*"))

    # One integration run per textbook, each using that textbook as the
    # base. Fan the CPU-bound runs out across worker processes.
    datasets = {}
    with ProcessPoolExecutor() as executor:
        # The result tuple already carries the base stem, so a plain list
        # of futures is enough — no future→path mapping needed.
        futures = [
            executor.submit(integrate_textbook, base_textbook_path, all_textbook_paths)
            for base_textbook_path in all_textbook_paths
        ]
        for future in as_completed(futures):
            base_textbook, result = future.result()
            datasets[base_textbook] = result

    with open("datasets.json", "w", encoding="utf-8") as f:
        json.dump(datasets, f)