Skip to content

Commit

Permalink
manufact uniq
Browse files Browse the repository at this point in the history
  • Loading branch information
Sangboom committed Nov 11, 2024
1 parent 7358971 commit cd4622a
Show file tree
Hide file tree
Showing 46 changed files with 10,972 additions and 31 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,6 @@ results_polyformer_b
weights

*.pt
*.png
*.png
*.pyc
*/__pycache__/*
Binary file modified __pycache__/trainer.cpython-37.pyc
Binary file not shown.
Binary file modified bert/__pycache__/activations.cpython-37.pyc
Binary file not shown.
Binary file modified bert/__pycache__/configuration_bert.cpython-37.pyc
Binary file not shown.
Binary file modified bert/__pycache__/configuration_utils.cpython-37.pyc
Binary file not shown.
Binary file modified bert/__pycache__/file_utils.cpython-37.pyc
Binary file not shown.
Binary file modified bert/__pycache__/generation_utils.cpython-37.pyc
Binary file not shown.
Binary file modified bert/__pycache__/modeling_bert.cpython-37.pyc
Binary file not shown.
Binary file modified bert/__pycache__/modeling_utils.cpython-37.pyc
Binary file not shown.
Binary file modified bert/__pycache__/tokenization_bert.cpython-37.pyc
Binary file not shown.
Binary file modified bert/__pycache__/tokenization_utils.cpython-37.pyc
Binary file not shown.
Binary file modified bert/__pycache__/tokenization_utils_base.cpython-37.pyc
Binary file not shown.
Binary file modified criterions/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file not shown.
Binary file modified data/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file modified data/__pycache__/base_dataset.cpython-37.pyc
Binary file not shown.
Binary file modified data/__pycache__/data_utils.cpython-37.pyc
Binary file not shown.
Binary file modified data/__pycache__/file_dataset.cpython-37.pyc
Binary file not shown.
Binary file modified data/__pycache__/poly_utils.cpython-37.pyc
Binary file not shown.
Binary file modified data/__pycache__/refcoco_dataset.cpython-37.pyc
Binary file not shown.
Binary file modified data/__pycache__/refcoco_pretrain_dataset.cpython-37.pyc
Binary file not shown.
12 changes: 6 additions & 6 deletions data/create_aihub_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@
splits = ['train', 'val']
splitBy = None
elif dataset == 'aihub_manufact':
# splits = ['val', 'test']
splits = ['train', 'val', 'test']
splits = ['val', 'test']
# splits = ['train', 'val', 'test']
splitBy = None

save_dir = f'datasets/finetune/{dataset}_bbox_fix'
Expand All @@ -91,10 +91,10 @@
writer = open(file_name, 'w')
refer = REFER(data_root, dataset, splitBy)

if dataset == 'aihub_manufact' and split == 'val':
ref_ids = refer.getRefIds(split='validation')
else:
ref_ids = refer.getRefIds(split=split)
# if dataset == 'aihub_manufact' and split == 'val':
# ref_ids = refer.getRefIds(split='validation')
# else:
ref_ids = refer.getRefIds(split=split)

for this_ref_id in tqdm(ref_ids):
this_img_id = refer.getImgIds(this_ref_id)
Expand Down
49 changes: 36 additions & 13 deletions data/create_pretraining_aihub_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,20 @@
import glob


img_path = 'refer/data/aihub_refcoco_format/indoor_80/images'
# img_path = 'refer/data/aihub_refcoco_format/manufact_80/images'
# img_path = 'refer/data/aihub_refcoco_format/indoor_80/images'
img_path = 'refer/data/aihub_refcoco_format/manufact_80/images'

# load annotation files
# f = open("datasets/annotations/instances.json")
# f = open("refer/data/aihub_refcoco_format/indoor_80/instances.json")
f = open("refer/data/aihub_refcoco_format/manufact_80/instances_2.json")
f = open("refer/data/aihub_refcoco_format/indoor_80/instances_2.json")
# f = open("refer/data/aihub_refcoco_format/manufact_80/instances.json")
# f = open("refer/data/aihub_refcoco_format/indoor_80/instances_2.json")
print("Loading annotation file")
data = json.load(f)
f.close()

# Define the directory containing your CSV files
csv_dir = 'data/aihub_csv_error_csv/indoor' # Replace with the actual directory path
# csv_dir = 'data/aihub_csv_error_csv/manufact' # Replace with the actual directory path
# csv_dir = 'data/aihub_csv_error_csv/indoor' # Replace with the actual directory path
csv_dir = 'data/aihub_csv_error_csv/manufact' # Replace with the actual directory path
csv_files = glob.glob(f'{csv_dir}/*.csv')

# Initialize an empty dictionary to store bounding box values from all CSV files
Expand Down Expand Up @@ -60,7 +58,7 @@
print(len(data['annotations']))

# ref_file = 'refer/data/aihub_refcoco_format/indoor_80/refs.p'
ref_file = 'refer/data/aihub_refcoco_format/manufact_80/refs.p'
ref_file = 'refer/data/aihub_refcoco_format/manufact_80/refs_2.p'
ref_ann = pickle.load(open(ref_file, 'rb'))
print(ref_ann[10])
print(ref_ann[1])
Expand Down Expand Up @@ -190,7 +188,7 @@

# ref_ann_i = next((d for d in ref_ann if d["ref_id"] == str(i)), None)
# ref_ann_i = ref_ann[i]
if ref_ann_i['split'] == 'validation':
if ref_ann_i['split'] == 'val':
# print("val!!")
pass
else:
Expand All @@ -205,10 +203,35 @@
height, width = img_dict_i['height'], img_dict_i['width']

try:
x, y, w, h = bbox
box_string = f'{x},{y},{x + w},{y + h}'
fn = img_dict_i['file_name']
img_id = fn.split(".")[0].split("_")[-1]

# Determine the appropriate prefix for file_name_key
prefix = fn.split(".")[0].split("_")[0] + "_"
file_name_key = f"{prefix}{img_id}"
# load box
if file_name_key in bbox_dict:
print('bbox dict')
# Update bbox value based on CSV data
x1, y1, x2, y2 = map(int, bbox_dict[file_name_key].split(','))
box_string = f'{x1},{y1},{x2},{y2}'
else:
# prefix = img_dict_i['file_name'].split('_')[0]
# print(prefix)
# box = refer.getRefBox(this_ref_id) # x,y,w,h
# Fallback to the default logic if not in combined CSV data
if prefix == "real_":
x, y, w, h = bbox
box_string = f'{x},{y},{x + w},{y + h}'
elif prefix == "syn_":
x1, y1, x2, y2 = bbox
box_string = f'{x1},{y1},{x2},{y2}'
else:
print("Image must be either real or syn")
exit()
except TypeError:
print(bbox)
# print(bbox)
print(ann_i)
continue

img_name = img_dict_i['file_name']
Expand All @@ -222,5 +245,5 @@
writer.writelines(lines)
writer.close()

print("train_idx", train_idx)
# print("train_idx", train_idx)
print('val_idx', val_idx)
Binary file modified models/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file modified models/polyformer/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file modified models/polyformer/__pycache__/polyformer.cpython-37.pyc
Binary file not shown.
Binary file modified models/polyformer/__pycache__/swin.cpython-37.pyc
Binary file not shown.
Binary file not shown.
Binary file modified models/polyformer/__pycache__/unify_transformer.cpython-37.pyc
Binary file not shown.
Binary file not shown.
Binary file modified polyformer_module/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ model='polyformer_b'
num_bins=64
batch_size=16

dataset='aihub_manufact'
ckpt_path=../finetune/polyformer_b_aihub_manufact_checkpoints/100_5e-5_512/checkpoint_last.pt
dataset='aihub_manufact_bbox_fix'
# ckpt_path=../finetune/polyformer_b_aihub_manufact_checkpoints/100_5e-5_512/checkpoint_last.pt
ckpt_path=../finetune/polyformer_b_aihub_manufact_80_uniq_checkpoints/100_5e-5_512/checkpoint_epoch_55.pt
# dataset='refcocog'
# ckpt_path=../../weights/polyformer_b_refcocog.pt

Expand All @@ -45,5 +46,6 @@ python3 -m torch.distributed.launch --nproc_per_node=${GPUS_PER_NODE} --master_p
--num-bins=${num_bins} \
--vis_dir=${vis_dir} \
--result_dir=${result_dir} \
--model-overrides="{\"data\":\"${data}\",\"bpe_dir\":\"${bpe_dir}\",\"selected_cols\":\"${selected_cols}\"}"
--model-overrides="{\"data\":\"${data}\",\"bpe_dir\":\"${bpe_dir}\",\"selected_cols\":\"${selected_cols}\"}" \
# --vis
done
Original file line number Diff line number Diff line change
Expand Up @@ -2332,3 +2332,137 @@ slice_id 0 seek offset 0
2024-11-08 10:32:42 - train.py[line:297] - INFO: Start iterating over samples
2024-11-08 10:32:50 - progress_bar.py[line:274] - INFO: epoch 011: 1 / 391 loss=0.001, loss_v1=0, loss_v2=0, nll_loss=2.985, ntokens=16977.9, nsentences=985.6, sample_size=985.6, sample_size_v1=0, sample_size_v2=0, ppl=7.92, wps=347.2, ups=0.02, wpb=16977.9, bsz=985.6, num_updates=3910, lr=4.78723e-05, gnorm=0.008, clip=0, loss_scale=8192, train_wall=29, gb_free=13.3, wall=25183
2024-11-08 10:33:43 - progress_bar.py[line:274] - INFO: epoch 011: 11 / 391 loss=0.001, loss_v1=0, loss_v2=0, nll_loss=2.874, ntokens=17466.2, nsentences=1024, sample_size=1024, sample_size_v1=0, sample_size_v2=0, ppl=7.33, wps=3300.5, ups=0.19, wpb=17466.2, bsz=1024, num_updates=3920, lr=4.78587e-05, gnorm=0.008, clip=0, loss_scale=8192, train_wall=30, gb_free=13.5, wall=25235
2024-11-08 10:34:36 - progress_bar.py[line:274] - INFO: epoch 011: 21 / 391 loss=0.001, loss_v1=0, loss_v2=0, nll_loss=2.875, ntokens=17232.2, nsentences=1024, sample_size=1024, sample_size_v1=0, sample_size_v2=0, ppl=7.34, wps=3261.5, ups=0.19, wpb=17232.2, bsz=1024, num_updates=3930, lr=4.78451e-05, gnorm=0.011, clip=0, loss_scale=8192, train_wall=31, gb_free=13.6, wall=25288
2024-11-08 10:35:31 - progress_bar.py[line:274] - INFO: epoch 011: 31 / 391 loss=0.001, loss_v1=0, loss_v2=0, nll_loss=2.849, ntokens=17421.6, nsentences=1024, sample_size=1024, sample_size_v1=0, sample_size_v2=0, ppl=7.2, wps=3152.5, ups=0.18, wpb=17421.6, bsz=1024, num_updates=3940, lr=4.78315e-05, gnorm=0.01, clip=0, loss_scale=8192, train_wall=49, gb_free=13.1, wall=25344
2024-11-08 10:36:26 - progress_bar.py[line:274] - INFO: epoch 011: 41 / 391 loss=0.001, loss_v1=0, loss_v2=0, nll_loss=2.854, ntokens=17403.5, nsentences=1024, sample_size=1024, sample_size_v1=0, sample_size_v2=0, ppl=7.23, wps=3160.4, ups=0.18, wpb=17403.5, bsz=1024, num_updates=3950, lr=4.78179e-05, gnorm=0.008, clip=0, loss_scale=8192, train_wall=55, gb_free=13.3, wall=25399
2024-11-08 10:37:22 - progress_bar.py[line:274] - INFO: epoch 011: 51 / 391 loss=0.001, loss_v1=0, loss_v2=0, nll_loss=2.836, ntokens=17456.1, nsentences=1024, sample_size=1024, sample_size_v1=0, sample_size_v2=0, ppl=7.14, wps=3151.8, ups=0.18, wpb=17456.1, bsz=1024, num_updates=3960, lr=4.78043e-05, gnorm=0.007, clip=0, loss_scale=8192, train_wall=55, gb_free=13.5, wall=25454
Traceback (most recent call last):
File "../../train.py", line 543, in <module>
cli_main()
File "../../train.py", line 536, in cli_main
Traceback (most recent call last):
File "../../train.py", line 543, in <module>
distributed_utils.call_main(cfg, main)
File "/home/ubuntu/src/fairseq/fairseq/fairseq/distributed/utils.py", line 374, in call_main
distributed_main(cfg.distributed_training.device_id, main, cfg, kwargs)
File "/home/ubuntu/src/fairseq/fairseq/fairseq/distributed/utils.py", line 348, in distributed_main
main(cfg, **kwargs)
File "../../train.py", line 190, in main
valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/contextlib.py", line 74, in inner
cli_main()
return func(*args, **kwds) File "../../train.py", line 536, in cli_main

File "../../train.py", line 298, in train
for i, samples in enumerate(progress):
File "/home/ubuntu/src/fairseq/fairseq/fairseq/logging/progress_bar.py", line 261, in __iter__
distributed_utils.call_main(cfg, main)
File "/home/ubuntu/src/fairseq/fairseq/fairseq/distributed/utils.py", line 374, in call_main
for i, obj in enumerate(self.iterable, start=self.n):
File "/home/ubuntu/src/fairseq/fairseq/fairseq/data/iterators.py", line 56, in __next__
x = next(self._itr)
File "/home/ubuntu/src/fairseq/fairseq/fairseq/data/iterators.py", line 509, in _chunk_iterator
distributed_main(cfg.distributed_training.device_id, main, cfg, kwargs)
File "/home/ubuntu/src/fairseq/fairseq/fairseq/distributed/utils.py", line 348, in distributed_main
for x in itr:main(cfg, **kwargs)

File "/home/ubuntu/src/fairseq/fairseq/fairseq/data/iterators.py", line 56, in __next__
File "../../train.py", line 190, in main
x = next(self._itr)
File "/home/ubuntu/src/fairseq/fairseq/fairseq/data/iterators.py", line 637, in __next__
valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/contextlib.py", line 74, in inner
return func(*args, **kwds)
File "../../train.py", line 298, in train
raise item
File "/home/ubuntu/src/fairseq/fairseq/fairseq/data/iterators.py", line 567, in run
for i, samples in enumerate(progress):
File "/home/ubuntu/src/fairseq/fairseq/fairseq/logging/progress_bar.py", line 261, in __iter__
for i, obj in enumerate(self.iterable, start=self.n):
File "/home/ubuntu/src/fairseq/fairseq/fairseq/data/iterators.py", line 56, in __next__
for item in self._source:
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 628, in __next__
x = next(self._itr)
File "/home/ubuntu/src/fairseq/fairseq/fairseq/data/iterators.py", line 509, in _chunk_iterator
for x in itr:
data = self._next_data() File "/home/ubuntu/src/fairseq/fairseq/fairseq/data/iterators.py", line 56, in __next__

File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 671, in _next_data
x = next(self._itr)
File "/home/ubuntu/src/fairseq/fairseq/fairseq/data/iterators.py", line 637, in __next__
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 58, in fetch
raise item
File "/home/ubuntu/src/fairseq/fairseq/fairseq/data/iterators.py", line 567, in run
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 58, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/ubuntu/workspaces/AIHub-polygon-transformer/data/refcoco_dataset.py", line 82, in __getitem__
for item in self._source:
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 628, in __next__
data = self.dataset[index]
File "/home/ubuntu/workspaces/AIHub-polygon-transformer/data/file_dataset.py", line 111, in __getitem__
column_l = [dtype(column_l[col_id]) for col_id, dtype in zip(self.selected_col_ids, self.dtypes)]
File "/home/ubuntu/workspaces/AIHub-polygon-transformer/data/file_dataset.py", line 111, in <listcomp>
column_l = [dtype(column_l[col_id]) for col_id, dtype in zip(self.selected_col_ids, self.dtypes)]
data = self._next_data()
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 671, in _next_data
IndexError: list index out of range
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 58, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 58, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/ubuntu/workspaces/AIHub-polygon-transformer/data/refcoco_dataset.py", line 82, in __getitem__
data = self.dataset[index]
File "/home/ubuntu/workspaces/AIHub-polygon-transformer/data/file_dataset.py", line 111, in __getitem__
column_l = [dtype(column_l[col_id]) for col_id, dtype in zip(self.selected_col_ids, self.dtypes)]
File "/home/ubuntu/workspaces/AIHub-polygon-transformer/data/file_dataset.py", line 111, in <listcomp>
column_l = [dtype(column_l[col_id]) for col_id, dtype in zip(self.selected_col_ids, self.dtypes)]
IndexError: list index out of range
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1721037 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1721039 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1721040 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1721041 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1721043 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 1721044 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 1721038) of binary: /home/ubuntu/anaconda3/envs/polyformer/bin/python3
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/site-packages/torch/distributed/launch.py", line 195, in <module>
main()
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/site-packages/torch/distributed/launch.py", line 191, in main
launch(args)
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/site-packages/torch/distributed/launch.py", line 176, in launch
run(args)
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/site-packages/torch/distributed/run.py", line 756, in run
)(*cmd_args)
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/ubuntu/anaconda3/envs/polyformer/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 248, in launch_agent
failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
../../train.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2024-11-08_10:38:04
host : ip-172-31-11-84.us-west-2.compute.internal
rank : 5 (local_rank: 5)
exitcode : 1 (pid: 1721042)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-11-08_10:38:04
host : ip-172-31-11-84.us-west-2.compute.internal
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 1721038)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
Loading

0 comments on commit cd4622a

Please sign in to comment.