inference_Sony_our.py
# Original code: https://github.com/cchen156/Learning-to-See-in-the-Dark
# uniform content loss + adaptive threshold + per_class_input + recursive G
# improvement upon cqf37
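# NOTE: main_fun is the per-executor entry point used by TensorFlowOnSpark.
# Each Spark executor imports and runs this function, builds the TensorFlow
# graph locally, and acts either as a parameter server ("ps") or as a worker
# that runs inference on image patches fed from a Spark RDD.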
def main_fun(argv, ctx):
    # this will be executed/imported on the executors.
    import time
    from datetime import datetime
    import tensorflow as tf
    import tensorflow.contrib.slim as slim
    from tensorflowonspark import TFNode
    import numpy as np

    num_workers = len(ctx.cluster_spec['worker'])
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    # the cluster has no GPUs
    cluster, server = TFNode.start_cluster_server(ctx, num_gpus=0)

    # Create generator for Spark data feed
    tf_feed = ctx.get_data_feed(argv.mode == 'train')
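    # tf_feed streams this executor's partition of the input RDD into TensorFlow.
    # argv.mode == 'train' toggles the feed's training behaviour; for this
    # inference script it is presumably False, so results can be pushed back to
    # Spark via tf_feed.batch_results() below.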
    def rdd_generator():
        while not tf_feed.should_stop():
            batch = tf_feed.next_batch(1)
            if len(batch) == 0:
                return
            row = batch[0]
            input_patch = row[0]
            yield (input_patch, np.random.rand(1024, 1024, 3))
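    # NOTE: the generator yields (input_patch, placeholder_gt) pairs. The random
    # 1024x1024x3 array is only a dummy ground-truth tensor that satisfies the
    # dataset signature; the loss built from it is never run in the inference
    # loop below.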
    if job_name == "ps":
        server.join()
    elif job_name == "worker":
        with tf.device(tf.train.replica_device_setter(worker_device="/job:worker/task:%d" % task_index,
                                                      cluster=cluster)):

            def lrelu(x):
                return tf.maximum(x * 0.2, x)

            def upsample_and_concat(x1, x2, output_channels, in_channels):
                pool_size = 2
                deconv_filter = tf.Variable(
                    tf.truncated_normal([pool_size, pool_size, output_channels, in_channels], stddev=0.02))
                deconv = tf.nn.conv2d_transpose(x1, deconv_filter, tf.shape(x2), strides=[1, pool_size, pool_size, 1])

                deconv_output = tf.concat([deconv, x2], 3)
                deconv_output.set_shape([None, None, None, output_channels * 2])

                return deconv_output
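            # upsample_and_concat is the U-Net decoder step: a 2x transposed
            # convolution upsamples x1 to the spatial size of the skip tensor x2,
            # then the two are concatenated along the channel axis.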
            def network(input):
                conv1 = slim.conv2d(input, 32, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv1_1')
                conv1 = slim.conv2d(conv1, 32, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv1_2')
                pool1 = slim.max_pool2d(conv1, [2, 2], padding='SAME')

                conv2 = slim.conv2d(pool1, 64, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv2_1')
                conv2 = slim.conv2d(conv2, 64, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv2_2')
                pool2 = slim.max_pool2d(conv2, [2, 2], padding='SAME')

                conv3 = slim.conv2d(pool2, 128, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv3_1')
                conv3 = slim.conv2d(conv3, 128, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv3_2')
                pool3 = slim.max_pool2d(conv3, [2, 2], padding='SAME')

                conv4 = slim.conv2d(pool3, 256, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv4_1')
                conv4 = slim.conv2d(conv4, 256, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv4_2')
                pool4 = slim.max_pool2d(conv4, [2, 2], padding='SAME')

                conv5 = slim.conv2d(pool4, 512, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv5_1')
                conv5 = slim.conv2d(conv5, 512, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv5_2')

                up6 = upsample_and_concat(conv5, conv4, 256, 512)
                conv6 = slim.conv2d(up6, 256, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv6_1')
                conv6 = slim.conv2d(conv6, 256, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv6_2')

                up7 = upsample_and_concat(conv6, conv3, 128, 256)
                conv7 = slim.conv2d(up7, 128, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv7_1')
                conv7 = slim.conv2d(conv7, 128, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv7_2')

                up8 = upsample_and_concat(conv7, conv2, 64, 128)
                conv8 = slim.conv2d(up8, 64, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv8_1')
                conv8 = slim.conv2d(conv8, 64, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv8_2')

                up9 = upsample_and_concat(conv8, conv1, 32, 64)
                conv9 = slim.conv2d(up9, 32, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv9_1')
                conv9 = slim.conv2d(conv9, 32, [3, 3], rate=1, activation_fn=lrelu, scope='g_conv9_2')

                conv10 = slim.conv2d(conv9, 12, [1, 1], rate=1, activation_fn=None, scope='g_conv10')
                out = tf.depth_to_space(conv10, 2)
                return out
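            # network() is the U-Net generator from Learning-to-See-in-the-Dark:
            # a 4-channel packed-Bayer input is encoded through five conv blocks,
            # decoded with skip connections, and the final 12-channel map is
            # rearranged by tf.depth_to_space into a 3-channel image at twice
            # the spatial resolution.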
            ds = tf.data.Dataset.from_generator(rdd_generator, (tf.float32, tf.float32), (
                tf.TensorShape([None, None, 4]), tf.TensorShape([None, None, 3]))).batch(1)
            iterator = ds.make_one_shot_iterator()
            in_image, gt_image = iterator.get_next()

            out_image = network(in_image)

            global_step = tf.train.get_or_create_global_step()

            G_loss = tf.reduce_mean(tf.abs(out_image - gt_image))

            t_vars = tf.trainable_variables()
            lr = tf.placeholder(tf.float32)
            G_opt = tf.train.AdamOptimizer(learning_rate=lr).minimize(G_loss, global_step=global_step)

            saver = tf.train.Saver()
            init_op = tf.global_variables_initializer()
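            # NOTE: G_loss, lr and G_opt mirror the original training graph, but the
            # inference loop below only ever runs out_image, so these ops (and the
            # lr placeholder) are never fed or executed here.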
        # Create a "supervisor", which oversees the training process and stores model state into HDFS
        logdir = ctx.absolute_path(argv.model)
        print("tensorflow model path: {0}".format(logdir))

        with tf.train.MonitoredTrainingSession(master=server.target,
                                               is_chief=(task_index == 0),
                                               scaffold=tf.train.Scaffold(init_op=init_op, saver=saver),
                                               checkpoint_dir=logdir,
                                               hooks=[]) as sess:
            print("{} session ready".format(datetime.now().isoformat()))

            while not sess.should_stop() and not tf_feed.should_stop():
                output = sess.run(out_image)
                output = np.minimum(np.maximum(output, 0), 1)
                tf_feed.batch_results(output)

            print("{} stopping MonitoredTrainingSession".format(datetime.now().isoformat()))

        if sess.should_stop():
            tf_feed.terminate()
    # WORKAROUND FOR https://github.com/tensorflow/tensorflow/issues/21745
    # wait for all other nodes to complete (via done files)
    done_dir = "{}/{}/done".format(ctx.absolute_path(argv.model), argv.mode)
    print("Writing done file to: {}".format(done_dir))
    tf.gfile.MakeDirs(done_dir)
    with tf.gfile.GFile("{}/{}".format(done_dir, ctx.task_index), 'w') as done_file:
        done_file.write("done")

    for i in range(60):
        if len(tf.gfile.ListDirectory(done_dir)) < len(ctx.cluster_spec['worker']):
            print("{} Waiting for other nodes {}".format(datetime.now().isoformat(), i))
            time.sleep(1)
        else:
            print("{} All nodes done".format(datetime.now().isoformat()))
            break
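
# A minimal driver sketch (not part of this file; the RDD and argument names
# below are hypothetical) of how main_fun is typically wired up with
# TensorFlowOnSpark's Spark input mode:
#
#   from tensorflowonspark import TFCluster
#   cluster = TFCluster.run(sc, main_fun, args, num_executors, num_ps=1,
#                           tensorboard=False, input_mode=TFCluster.InputMode.SPARK)
#   output_rdd = cluster.inference(image_patch_rdd)  # feeds tf_feed, collects batch_results
#   output_rdd.saveAsTextFile(args.output)
#   cluster.shutdown()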