flightextract.py
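"""Extract continuous flights from decoded ADS-B position data.

Reads a decoded ADS-B position CSV, groups the positions per aircraft
(icao), splits each aircraft's positions into separate flights by
clustering timestamps with DBSCAN, and writes every extracted flight to
a MongoDB collection. When no output collection is given, the script
runs in test mode and only plots the clustering results.
"""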
import argparse
import numpy as np
import pandas as pd
from itertools import cycle
from matplotlib import pyplot as plt
from sklearn import preprocessing
from pymongo import MongoClient
from collections import OrderedDict
from sklearn.cluster import DBSCAN
# Constants
HOST = "localhost" # MongoDB host
PORT = 27017 # MongoDB port
MIN_DATA_SIZE = 100  # minimum number of data points in a flight
CHUNK_SIZE = 50 # number of icaos to be processed in chunks
TEST_FLAG = True # if this is a test run
# get script arguments
parser = argparse.ArgumentParser()
parser.add_argument(
    "--csv", dest="input_csv", required=True, help="decoded adsb csv file"
)
parser.add_argument("--db", dest="db", required=True)
parser.add_argument("--coll", dest="output_coll", help="Flight collection name")
args = parser.parse_args()
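# Example invocation (the file, database, and collection names below are
# placeholders, not values required by the script):
#   python flightextract.py --csv decoded_adsb.csv --db adsb --coll flights
# Omitting --coll keeps TEST_FLAG set, so clusters are only plotted.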
mdb = args.db
pos_csv = args.input_csv
flights_coll = args.output_coll
mongo_client = MongoClient(HOST, PORT)
if flights_coll:
    TEST_FLAG = False
    mcollflights = mongo_client[mdb][flights_coll]
    mcollflights.drop()  # clear the existing flight collection first
print("[1] Querying database.")
df = pd.read_csv(pos_csv)
df.drop_duplicates(subset=["ts"], inplace=True)
# Find all ICAO IDs in the dataset
dfcount = df.groupby("icao").size().reset_index(name="counts")
icaos = dfcount[dfcount.counts > MIN_DATA_SIZE].icao.tolist()
print("[2] %d number of valid ICAOs." % len(icaos))

for i in range(0, len(icaos), CHUNK_SIZE):
    print("[3][%d-%d of %d] ICAOs being processed." % (i, i + CHUNK_SIZE, len(icaos)))
    chunk = icaos[i : i + CHUNK_SIZE]

    print(" [a] fetching records")

    dfchunk = df[df.icao.isin(chunk)]

    ids = dfchunk["icao"].to_numpy()
    lats = dfchunk["lat"].to_numpy()
    lons = dfchunk["lon"].to_numpy()
    alts = dfchunk["alt"].to_numpy()
    spds = dfchunk["spd"].to_numpy()
    hdgs = dfchunk["hdg"].to_numpy()
    rocs = dfchunk["roc"].to_numpy()
    times = dfchunk["ts"].to_numpy()

    ######################################################################
    # Continuous flight path extraction using machine learning algorithms
    ######################################################################
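    # Approach: only the (scaled) timestamp column is handed to DBSCAN
    # (all other feature columns are zeroed out below), so positions of
    # the same aircraft separated by a time gap larger than roughly 30
    # minutes end up in different clusters, i.e. different flights.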
print(" [b] data scaling")
# transform the text ids into numbers
# ------------------------------------
le = preprocessing.LabelEncoder()
encoded_ids = le.fit_transform(ids)
# Apply feature scaling - on altitude, spds, and times
# ------------------------------------------------------
mms = preprocessing.MinMaxScaler(feature_range=(0, 1000))
times_norm = mms.fit_transform(times.reshape((-1, 1)))
dt = mms.scale_ * 0.5 * 60 * 60 # time interval of 30 mins
mms = preprocessing.MinMaxScaler(feature_range=(0, 100))
alts_norm = mms.fit_transform(alts.reshape((-1, 1)))
print(" [c] creating machine learning dataset.")
# aggregate data by there ids
# ----------------------------
acs = {}
for i in range(len(ids)):
if ids[i] not in list(acs.keys()):
acs[ids[i]] = []
acs[ids[i]].append(
[
times_norm[i],
alts_norm[i],
times[i],
lats[i],
lons[i],
int(alts[i]),
spds[i],
hdgs[i],
rocs[i],
]
)
print(" [d] start clustering, and saving results to DB")
# Apply clustering method
# ------------------------
cluster = DBSCAN(eps=dt, min_samples=MIN_DATA_SIZA)
acsegs = {}
total = len(list(acs.keys()))
count = 0
print(" * processing ", end=" ")
for k in list(acs.keys()):
count += 1
print(".", end=" ")
data = np.asarray(acs[k])
tdata = np.copy(data)
tdata[:, 1:] = 0
cluster.fit(tdata)
labels = cluster.labels_
n_clusters = np.unique(labels).size
# print("n_clusters : %d" % n_clusters)

        if not TEST_FLAG:
            for j in range(n_clusters):
                mask = labels == j
                c = data[mask, 2:]
                c = c[c[:, 0].argsort()]  # sort by ts
                if len(c) > MIN_DATA_SIZE:
                    mcollflights.insert_one(
                        OrderedDict(
                            [
                                ("icao", k),
                                ("ts", c[:, 0].tolist()),
                                ("lat", c[:, 1].tolist()),
                                ("lon", c[:, 2].tolist()),
                                ("alt", c[:, 3].tolist()),
                                ("spd", c[:, 4].tolist()),
                                ("hdg", c[:, 5].tolist()),
                                ("roc", c[:, 6].tolist()),
                            ]
                        )
                    )

        # Plot result
        if TEST_FLAG:
            colorset = cycle(["purple", "green", "red", "blue", "orange"])
            for j, color in zip(range(n_clusters), colorset):
                mask = labels == j
                ts = data[mask, 0].tolist()  # scaled time (0-1000)
                alt = data[mask, 5].tolist()
                if len(alt) > MIN_DATA_SIZE:
                    plt.plot(ts, alt, "w", color=color, marker=".", alpha=1.0)
            plt.xlim([0, 1000])
            plt.draw()
            plt.waitforbuttonpress(-1)
            plt.clf()
print("")
print()
print("[4] All completed")