-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractROIs.py
127 lines (84 loc) · 3.63 KB
/
extractROIs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""
To be used on the CSV file exported from Zooniverse.
Currently assumes the Zooniverse data and the metadata of our annotations are in the same directory as this script.
"""
import pandas as pd
import json
# Path of the Zooniverse CSV export that is passed to extractROIs at the bottom of this script.
CSV_PATH = "rawData.csv"
# WARNING: the CSV is inconsistent about which key of a subject's metadata holds the image
# file name; "filename" and "0001_R.png" both occur as key names.
def extractROIs(
    csv_file_path,
    *,
    filename_keys=("filename", "uclaclark_Q143B7S3_0121.png", "0001_R.png"),
    false_positives=(),
):
    """Collect the annotated rectangles of every image listed in a Zooniverse CSV.

    The CSV must provide the columns "annotations", "subject_data" and
    "subject_ids". For each row the image file name is looked up in the
    subject's metadata, and every rectangle drawn in task "T1" or "T4" is
    recorded as an ``(x, y, width, height)`` tuple of ints.

    Args:
        csv_file_path: Path of the CSV file to read.
        filename_keys: Metadata keys that may hold the image file name, tried
            in order. The CSV is inconsistent about which key it uses.
        false_positives: File names whose annotations must be ignored, e.g.
            images on which users only left a stray mark.

    Returns:
        A dict mapping each file name to the list of its regions, merged
        across all rows that refer to the same file. Files without any
        region are left out.

    Raises:
        ValueError: If ``csv_file_path`` is empty.
        KeyError: If a subject's metadata contains none of ``filename_keys``.
    """
    if not csv_file_path:
        raise ValueError("csv_file_path must be a non-empty path")

    df = pd.read_csv(
        csv_file_path, usecols=["annotations", "subject_data", "subject_ids"]
    )

    regions_by_file = dict()
    for _, row in df.iterrows():
        s_id = str(row["subject_ids"])
        subject_meta = json.loads(row["subject_data"])[s_id]

        # The key that stores the file name differs between rows, so try each
        # known candidate and only treat a missing key as "try the next one".
        for key in filename_keys:
            try:
                fn = subject_meta[key]
            except KeyError:
                continue
            break
        else:
            raise KeyError(
                "none of {} found in subject_data of subject {}".format(
                    list(filename_keys), s_id
                )
            )

        image_regions = []
        for task in json.loads(row["annotations"]):
            if task["task"] not in ("T1", "T4"):
                continue
            for coord in task["value"]:
                image_regions.append(
                    (
                        int(coord["x"]),
                        int(coord["y"]),
                        int(coord["width"]),
                        int(coord["height"]),
                    )
                )

        # skip unannotated images and images flagged as false positives
        if fn in false_positives or not image_regions:
            continue
        # several rows can refer to the same image: merge their regions
        regions_by_file.setdefault(fn, []).extend(image_regions)

    region_count = sum(len(regions) for regions in regions_by_file.values())
    print("There are {} regions of interest".format(region_count))
    return regions_by_file
def convertToMaskRCNN(regionImageData: dict, maxWidth: int = 999) -> dict:
    """Turn per-image rectangles into per-image polygon region dictionaries.

    Args:
        regionImageData: Maps a file name to a sequence of regions, each
            indexable as ``(x, y, width, height)``.
        maxWidth: Largest x coordinate allowed for the right edge of a box.
            Zooniverse apparently allowed people to annotate beyond the actual
            image width, so the right edge is capped at this value.

    Returns:
        A dict mapping each file name to
        ``{"filename": <file name>, "regions": {"0": ..., "1": ...}}`` where
        every region holds a "shape_attributes" polygon made of the four
        corners of the rectangle.
    """
    regionDataFormatted = dict()
    for img, regions in regionImageData.items():
        regionValue = dict()  # the regions of one image, keyed by their index as a string
        for i, region in enumerate(regions):
            # the left edge is raised to at least 1 and the right edge is
            # capped at maxWidth; y coordinates are used as given
            x1 = max(1, region[0])
            x2 = min(x1 + region[2], maxWidth)
            y1 = region[1]
            y2 = y1 + region[3]
            # four (x, y) points are needed to create a bounding box
            regionValue[str(i)] = {
                "shape_attributes": {
                    "name": "polygon",
                    "all_points_x": [x1, x2, x2, x1],
                    "all_points_y": [y1, y1, y2, y2],
                }
            }
        regionDataFormatted[img] = {"filename": img, "regions": regionValue}
    return regionDataFormatted
# Script entry: read the regions from the CSV, reshape them, then store the
# result in data.json and echo it to stdout.
regionData = extractROIs(CSV_PATH)
regionDataFormatted = convertToMaskRCNN(regionData)
with open('data.json', 'w') as dataFile:
    dataFile.write(json.dumps(regionDataFormatted))
print(regionDataFormatted)
# to see a text file of the json
# with open('formattedData.txt', 'w') as f:
# for item in regionDataFormatted.items():
# f.write("{}\n".format(item))