Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modified model architecture #126

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,13 @@ ENV/

# data
dataset/
datasets/
graphormer/data/pyg_datasets/datasets/
examples/property_prediction/dataset/
examples/property_prediction/datasets/
graphormer/data/pyg_datasets/test.py
graphormer/data/pyg_datasets/test.tar
fs-mol/

# reranking
/examples/reranking/rerank_data
Expand All @@ -128,12 +135,14 @@ exps
# Weights and Biases logs
wandb/


*.pyc
*.log
ckpts
examples/dataset
examples/property_prediction/ckpts
#examples/property_prediction/dataset

!examples/property_prediction/dataset/pcqm4m-v2/RELEASE_v1.txt
!examples/property_prediction/dataset/pcqm4m_kddcup2021/RELEASE_v1.txt

# for self-testing
examples/property_prediction/pcqv2_pyg.sh
examples/property_prediction/fs_mol.sh
35 changes: 35 additions & 0 deletions examples/property_prediction/fs_mol.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# Train graphormer_base on FS-Mol (multi-task binary classification,
# one class per task head) via fairseq.

# Devices actually used for training. n_gpu is derived from this list so the
# update schedule below cannot disagree with the real world size — the
# original hard-coded n_gpu=4 while CUDA_VISIBLE_DEVICES exposed only 2 GPUs,
# halving the intended total/warmup update counts.
gpu_devices="0,1"
n_gpu=$(( $(tr -cd ',' <<< "${gpu_devices}" | wc -c) + 1 ))

epoch=300
max_epoch=$((epoch + 1))
batch_size=64
# ~500k training molecules: optimizer updates for the whole run, with a
# 10% linear warmup.
tot_updates=$((500000*epoch/batch_size/n_gpu))
warmup_updates=$((tot_updates/10))

CUDA_VISIBLE_DEVICES=${gpu_devices} fairseq-train \
--user-dir ./graphormer \
--num-workers 16 \
--ddp-backend=legacy_ddp \
--dataset-name fsmol \
--dataset-source pyg \
--task graph_prediction \
--criterion binary_logloss \
--arch graphormer_base \
--num-classes 5135 \
--attention-dropout 0.1 --act-dropout 0.1 --dropout 0.0 \
--optimizer adam --adam-betas '(0.9, 0.999)' --adam-eps 1e-8 --clip-norm 5.0 --weight-decay 0.0 \
--lr-scheduler polynomial_decay --power 1 --warmup-updates ${warmup_updates} --total-num-update ${tot_updates} \
--lr 2e-4 --end-learning-rate 1e-9 \
--batch-size ${batch_size} \
--data-buffer-size 20 \
--encoder-layers 12 \
--encoder-embed-dim 768 \
--encoder-ffn-embed-dim 768 \
--encoder-attention-heads 32 \
--max-epoch ${max_epoch} \
--no-save \
--sandwich-norm \
--fp16
2 changes: 1 addition & 1 deletion examples/property_prediction/hiv_pre.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ CUDA_VISIBLE_DEVICES=3 fairseq-train \
--dataset-source ogb \
--task graph_prediction_with_flag \
--criterion binary_logloss_with_flag \
--arch graphormer_base \
--arch graphormer_graphpred_base \
--num-classes 1 \
--attention-dropout 0.1 --act-dropout 0.1 --dropout 0.0 \
--optimizer adam --adam-betas '(0.9, 0.999)' --adam-eps 1e-8 --clip-norm 5.0 --weight-decay 0.0 \
Expand Down
2 changes: 1 addition & 1 deletion examples/property_prediction/pcqv1.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ fairseq-train \
--dataset-source ogb \
--task graph_prediction \
--criterion l1_loss \
--arch graphormer_base \
--arch graphormer_graphpred_base \
--num-classes 1 \
--attention-dropout 0.1 --act-dropout 0.1 --dropout 0.0 \
--optimizer adam --adam-betas '(0.9, 0.999)' --adam-eps 1e-8 --clip-norm 5.0 --weight-decay 0.0 \
Expand Down
2 changes: 1 addition & 1 deletion examples/property_prediction/pcqv2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ fairseq-train \
--dataset-source ogb \
--task graph_prediction \
--criterion l1_loss \
--arch graphormer_base \
--arch graphormer_graphpred_base \
--num-classes 1 \
--attention-dropout 0.1 --act-dropout 0.1 --dropout 0.0 \
--optimizer adam --adam-betas '(0.9, 0.999)' --adam-eps 1e-8 --clip-norm 5.0 --weight-decay 0.0 \
Expand Down
25 changes: 25 additions & 0 deletions examples/property_prediction/pcqv2_pyg.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# Train graphormer_graphpred_base on the PyG PCQM4Mv2 dataset with an L1
# regression loss (single target: HOMO-LUMO gap). Checkpointing is disabled
# via --no-save; re-enable --save-dir below to keep checkpoints.

# Allow full core dumps so training crashes can be debugged post-mortem.
ulimit -c unlimited

fairseq-train \
--user-dir ../../graphormer \
--num-workers 16 \
--ddp-backend=legacy_ddp \
--dataset-name pcqm4mv2_pyg \
--dataset-source pyg \
--task graph_prediction \
--criterion l1_loss \
--arch graphormer_graphpred_base \
--num-classes 1 \
--attention-dropout 0.1 --act-dropout 0.1 --dropout 0.0 \
--optimizer adam --adam-betas '(0.9, 0.999)' --adam-eps 1e-8 --clip-norm 5.0 --weight-decay 0.0 \
--lr-scheduler polynomial_decay --power 1 --warmup-updates 60000 --total-num-update 1000000 \
--lr 2e-4 --end-learning-rate 1e-9 \
--batch-size 256 \
--fp16 \
--data-buffer-size 20 \
--no-save
#--save-dir ./ckpts
2 changes: 1 addition & 1 deletion examples/property_prediction/zinc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ CUDA_VISIBLE_DEVICES=0 fairseq-train \
--dataset-source pyg \
--task graph_prediction \
--criterion l1_loss \
--arch graphormer_slim \
--arch graphormer_graphpred_slim \
--num-classes 1 \
--attention-dropout 0.1 --act-dropout 0.1 --dropout 0.0 \
--optimizer adam --adam-betas '(0.9, 0.999)' --adam-eps 1e-8 --clip-norm 5.0 --weight-decay 0.01 \
Expand Down
2 changes: 1 addition & 1 deletion graphormer/criterions/l1_loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def forward(self, model, sample, reduce=True):
natoms = sample["net_input"]["batched_data"]["x"].shape[1]

logits = model(**sample["net_input"])
logits = logits[:, 0, :]
#logits = logits[:, 0, :] # B x C
targets = model.get_targets(sample, [logits])

loss = nn.L1Loss(reduction="sum")(logits, targets[: logits.size(0)])
Expand Down
127 changes: 127 additions & 0 deletions graphormer/data/pyg_datasets/fsmol.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import os
import os.path as osp
import shutil
from ogb.utils import smiles2graph
from ogb.utils.torch_util import replace_numpy_with_torchtensor
from ogb.utils.url import decide_download, download_url, extract_zip
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch

from torch_geometric.data import InMemoryDataset, Data

import tarfile
import jsonlines
import gzip


# few-shot not implemented yet
class FSmolPYG(InMemoryDataset):
    """PyG wrapper for the FS-Mol dataset (few-shot protocol not implemented).

    Each jsonlines task file under ``<root>/fsmol/raw/<split>`` becomes one
    prediction head; every molecule carries its task's head index.
    """

    def __init__(
        self,
        root,
        split,
        seed: int = 0,
        transform=None,
        pre_transform=None
    ) -> None:
        assert split in ['train', 'valid', 'test']
        self.split = split
        self.original_root = root
        self.url = 'https://figshare.com/ndownloader/files/31345321'
        # process() converts SMILES via this callable; the original code
        # referenced self.smiles2graph without ever assigning it.
        self.smiles2graph = smiles2graph
        self.task_to_head = {}
        self.num_heads_total = 0
        # super().__init__ may trigger download()/process(); process()
        # (re)builds task_to_head itself, so the mapping is ready either way.
        super().__init__(root, transform, pre_transform)

        # If the processed file already existed, process() was skipped and the
        # head mapping is still empty -- rebuild it here (idempotent).
        if not self.task_to_head:
            self.unzip_gz_and_calc_head(osp.join(self.raw_dir, self.split))
        # raw_dir / processed_paths are properties: the original called
        # self.raw_dir() (TypeError) and torch.load-ed a bare filename.
        self.data, self.slices = torch.load(self.processed_paths[0])

    def unzip_gz_and_calc_head(self, path):
        """Decompress any still-gzipped task files under *path* and build the
        task-name -> head-index mapping; sets ``self.num_heads_total``.

        Idempotent: a second run (with the .gz files already deleted) yields
        the same mapping from the decompressed file names.
        """
        if not os.path.exists(path):
            raise Exception("The file to unzip does not exist!")
        head_num = 0
        for entry in sorted(os.listdir(path)):  # sorted => stable head ids
            task_name = entry[:-len(".gz")] if entry.endswith(".gz") else entry
            if task_name in self.task_to_head:
                continue
            # Original bug: used the literal key 'filename' for every task.
            self.task_to_head[task_name] = head_num
            head_num += 1
            if entry.endswith(".gz"):
                # Original bug: concatenated path+name without a separator and
                # unlinked a bare name relative to the process CWD.
                src = osp.join(path, entry)
                dst = osp.join(path, task_name)
                with gzip.open(src, "rb") as gz_f, open(dst, "wb") as out_f:
                    shutil.copyfileobj(gz_f, out_f)
                os.unlink(src)
        self.num_heads_total = head_num

    @property
    def raw_dir(self):  # e.g. datasets/fsmol/raw
        return f"{self.root}/fsmol/raw"

    @property
    def processed_dir(self):
        return f"{self.root}/fsmol/processed"

    @property
    def raw_file_names(self):
        return 'fsmol.tar'

    @property
    def processed_file_names(self):
        return f'{self.split}.pt'

    def download(self):
        """Download fsmol.tar into *root* and extract it there."""
        path = download_url(self.url, self.original_root)
        with tarfile.open(path) as tar:
            # Extract relative to the dataset root, not the process CWD.
            # NOTE(review): assumes the archive's internal layout matches the
            # fsmol/raw/<split> structure -- confirm against the figshare tar.
            tar.extractall(self.original_root)
        # os.unlink(path)  # keep the tar file

    def process(self):
        """Read every task's jsonlines file and build the graph `Data` list."""
        path = osp.join(self.raw_dir, self.split)
        # Build the head mapping first: process() runs inside
        # super().__init__, before __init__ has a chance to populate it.
        self.unzip_gz_and_calc_head(path)
        data_list = []
        for task_file in sorted(os.listdir(path)):
            task_name = task_file.replace(".gz", "")
            head = self.task_to_head[task_name]
            # Original bug: opened the bare directory-entry name (CWD-relative).
            with open(osp.join(path, task_file), "r", encoding="utf8") as f:
                for item in jsonlines.Reader(f):
                    data = Data()
                    data.head = head
                    data.smiles = item["SMILES"]
                    # -1 marks a negative sample; positives carry the head id.
                    data.y = -1 if item["Property"] == 0.0 else head
                    data_list.append(data)

        print(f"Converting SMILES strings to graphs in split '{self.split}':")
        for data in tqdm(data_list):
            graph = self.smiles2graph(data.smiles)
            data.x = torch.from_numpy(graph['node_feat']).to(torch.int64)
            data.edge_index = torch.from_numpy(graph['edge_index']).to(torch.int64)
            data.edge_attr = torch.from_numpy(graph['edge_feat']).to(torch.int64)
            del data.smiles  # drop the string once the graph is built

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])


if __name__ == '__main__':
    # Smoke test. FSmolPYG requires a root directory and a split name; the
    # original call passed neither and would raise TypeError immediately.
    dataset = FSmolPYG('datasets', 'train')
    print(dataset)
    print(dataset.data.edge_index)
    print(dataset.data.edge_index.shape)
    print(dataset.data.x.shape)
    print(dataset[100])
    print(dataset[100].y)
    # NOTE(review): FSmolPYG defines no get_idx_split(); the original call to
    # it (copied from pcqv2_pyg.py) would raise AttributeError and was dropped.
85 changes: 85 additions & 0 deletions graphormer/data/pyg_datasets/pcqv2_pyg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import os
import os.path as osp
import shutil
from ogb.utils import smiles2graph
from ogb.utils.torch_util import replace_numpy_with_torchtensor
from ogb.utils.url import decide_download, download_url, extract_zip
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch

from torch_geometric.data import InMemoryDataset, Data

class PCQv2PYG(InMemoryDataset):
    """PCQM4Mv2 wrapper that keeps only the first ``subset_size`` molecules.

    NOTE(review): this is a self-testing subset of the full dataset -- the
    real split_dict.pt loading is commented out in get_idx_split() and a
    synthetic 80/10/10 index split over the subset is returned instead.
    """

    def __init__(self, root='datasets', smiles2graph=smiles2graph,
                 transform=None, pre_transform=None, subset_size=20000):
        """
        Args:
            root: directory under which 'pcqm4m-v2' is stored.
            smiles2graph: callable mapping a SMILES string to a dict with
                'node_feat', 'edge_index' and 'edge_feat' numpy arrays.
            subset_size: number of leading molecules to keep (previously a
                hard-coded 20000 duplicated across process/get_idx_split).
        """
        self.folder = osp.join(root, 'pcqm4m-v2')
        self.original_root = root
        self.smiles2graph = smiles2graph
        self.url = 'https://dgl-data.s3-accelerate.amazonaws.com/dataset/OGB-LSC/pcqm4m-v2.zip'
        # Must be set before super().__init__, which may call process().
        self.subset_size = subset_size
        super().__init__(self.folder, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return 'data.csv.gz'

    @property
    def processed_file_names(self):
        return 'geometric_data_processed.pt'

    def download(self):
        # Download the zip next to `root`, extract it, then remove the zip.
        path = download_url(self.url, self.original_root)
        extract_zip(path, self.original_root)
        os.unlink(path)

    def process(self):
        # Read data into one big `Data` list (truncated for self-testing).
        df = pd.read_csv(osp.join(self.raw_dir, 'data.csv.gz'))
        smiles_list = df['smiles'][:self.subset_size]
        homolumogap_list = df['homolumogap'][:self.subset_size]
        data_list = []

        print("Converting SMILES strings to graphs...")
        for i in tqdm(range(len(smiles_list))):
            data = Data()
            graph = self.smiles2graph(smiles_list[i])
            data.x = torch.from_numpy(graph['node_feat']).to(torch.int64)
            data.edge_index = torch.from_numpy(graph['edge_index']).to(torch.int64)
            data.edge_attr = torch.from_numpy(graph['edge_feat']).to(torch.int64)
            data.y = torch.Tensor([homolumogap_list[i]])
            data_list.append(data)

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

    def get_idx_split(self):
        # Synthetic 80/10/10 split over the truncated subset. The real split
        # is preserved below, commented out, for when the full data is used:
        # split_dict = replace_numpy_with_torchtensor(torch.load(osp.join(self.root, 'split_dict.pt')))
        # return split_dict
        n = self.subset_size
        train_end = int(n * 0.8)   # 16000 at the default subset_size
        valid_end = int(n * 0.9)   # 18000 at the default subset_size
        return {
            'train': torch.arange(0, train_end, dtype=torch.int64),
            'valid': torch.arange(train_end, valid_end, dtype=torch.int64),
            'test-dev': torch.arange(valid_end, n, dtype=torch.int64),
        }

if __name__ == '__main__':
    # Quick manual smoke test of the dataset wrapper: build it (downloading
    # and processing on first run) and print a few summary values.
    ds = PCQv2PYG()
    for value in (
        ds,
        ds.data.edge_index,
        ds.data.edge_index.shape,
        ds.data.x.shape,
        ds[100],
        ds[100].y,
        ds.get_idx_split(),
    ):
        print(value)
Loading