-
Notifications
You must be signed in to change notification settings - Fork 0
/
Chicago_WNV_figures.py
503 lines (383 loc) · 17.9 KB
/
Chicago_WNV_figures.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 1 10:56:24 2024
https://data.cityofchicago.org/Health-Human-Services/West-Nile-Virus-WNV-Mosquito-Test-Results/jqe8-8r6s/about_data
@author: Dr.Potamitis Ilyas
"""
import matplotlib.pyplot as plt
import pandas as pd
import os
#import scipy
#from scipy.stats import hypergeom
#import datetime
import numpy as np
os.chdir('C:/python')
os.getcwd()
file_path = 'Chicago_WNV.xlsx'
df = pd.read_excel(file_path)
'''
Index(['SEASON YEAR', 'WEEK', 'TEST ID', 'BLOCK', 'TRAP', 'TRAP_TYPE',
'TEST DATE', 'NUMBER OF MOSQUITOES', 'RESULT', 'SPECIES',
'COMMUNITY AREA NUMBER', 'COMMUNITY AREA NAME', 'LATITUDE',
'LONGITUDE'],
dtype='object')
'''
# Update TRAP information with the community area details found online so that we dont delete the usefull data
trap_data_update = {
"T249": {
"COMMUNITY AREA NUMBER": 7,
"COMMUNITY AREA NAME": "Lincoln Park",
"LATITUDE": 41.9227,
"LONGITUDE": -87.6536
},
"T240B": {
"COMMUNITY AREA NUMBER": 7,
"COMMUNITY AREA NAME": "Lincoln Park",
"LATITUDE": 41.9227,
"LONGITUDE": -87.6536
},
"T143": {
"COMMUNITY AREA NUMBER": 10,
"COMMUNITY AREA NAME": "Norwood Park",
"LATITUDE": 41.9959,
"LONGITUDE": -87.8181
}
}
# Iterate over the trap data to update the relevant rows
for trap, data in trap_data_update.items():
mask = df['TRAP'] == trap
for key, value in data.items():
df.loc[mask, key] = value
# Dropping all rows with missing values in the 'RESULT' column
# Drop rows with missing values in the specified columns
columns_to_check = ['RESULT', 'COMMUNITY AREA NUMBER', 'COMMUNITY AREA NAME', 'LATITUDE', 'LONGITUDE']
df = df.dropna(subset=columns_to_check)
missing_values_per_column = df.isna().sum()
# Displaying the first few rows of the cleaned dataset
df.info()
df.rename(columns={
'TEST DATE': 'Date',
'SPECIES': 'Species',
'BLOCK': 'Address',
'COMMUNITY AREA NUMBER': 'Block',
'TRAP': 'Trap',
'LATITUDE': 'Latitude',
'LONGITUDE': 'Longitude',
'NUMBER OF MOSQUITOES': 'NumMosquitos',
'RESULT': 'WnvPresent',
'SEASON YEAR': 'year',
'WEEK': 'week'
}, inplace=True)
# Convert the 'Date' column to datetime format and handle missing values
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
# Drop rows where 'Date' could not be converted
df.dropna(subset=['Date'], inplace=True)
# Format the 'Date' column in the desired format '%d/%m/%Y'
df['Date'] = df['Date'].dt.strftime('%d/%m/%Y')
# Set 'Date' as the index
df.set_index('Date', inplace=True)
# Sort the dataframe by the index (date)
df.sort_index(inplace=True)
# Extract month and day from the index (which is the Date column) into separate columns
df['month'] = pd.to_datetime(df.index, format='%d/%m/%Y').month
df['day'] = pd.to_datetime(df.index, format='%d/%m/%Y').day
# round GPS coordinates for heatmap
df.Latitude = df.Latitude.round(3)
df.Longitude = df.Longitude.round(3)
# Each row is a batch
df.WnvPresent.sum()/len(df)*100 # percentage of positive batches
df.Trap.nunique() # unique traps
df[df.NumMosquitos>50].NumMosquitos # Each batch has maximum 50 insects. Two cases 2014-08-07 77, 2022-08-04 61
# Filter the data to include only rows with the specified TRAP_TYPES
filtered_df = df[df['TRAP_TYPE'].isin(['GRAVID', 'CDC', 'OVI', 'SENTINEL'])]
# Count unique traps for each TRAP_TYPE
unique_traps_per_type = filtered_df.groupby('TRAP_TYPE')['Trap'].nunique()
# Display how many trap types we have
unique_traps_per_type
# Plot 1: Histogram of 'year' where 'WnvPresent' is 1
plt.figure(figsize=(10, 6))
df[df['WnvPresent'] == 1]['year'].hist(bins=len(df['year'].unique()), color='darkblue')
plt.title('Distribution of WNV positive batches by Year', fontsize=16)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Counts', fontsize=12)
plt.xticks(sorted(df['year'].unique())) # Ensure the years are displayed in sorted order on x-axis
plt.grid(True)
plt.show()
# Table 1
# Create a summary table with Species as rows and two columns: NumMosquitos and WnvPresent
species_summary = df.groupby('Species').agg(
Total_Mosquitos=('NumMosquitos', 'sum'),
Total_WnvPresent=('WnvPresent', 'sum')
).sort_values(by='Total_Mosquitos', ascending=False).reset_index()
# Print the summary table
print(species_summary)
#species_summary.to_excel('species_summary.xlsx', index=False)
# Plot 2
# Plotting the mosquito species trends over time
# Group the data by 'year' and 'Species' to analyze the mosquito catches trends over time
# Grouping data by year and species and summing mosquito counts
species_over_time = df.groupby(['year', 'Species'])['NumMosquitos'].sum().unstack().fillna(0)
# Plotting the mosquito species trends over time with wider lines and larger markers
plt.figure(figsize=(12, 8))
species_over_time.plot(kind='line', figsize=(12, 8), marker='o', linewidth=2.5, markersize=8)
plt.title('Mosquito Species Trends Over Time')
plt.xlabel('Year')
plt.ylabel('Total Number of Mosquitos')
plt.xticks(ticks=species_over_time.index, labels=species_over_time.index.astype(int))
plt.grid(True)
# Adding the legend inside the figure
plt.legend(title='Species', loc='upper right', fontsize=10, frameon=True)
plt.tight_layout()
plt.show()
# Plot 3: Histogram of 'week' where 'WnvPresent' is 1
# Mapping of weeks to approximate month labels (assuming each month has roughly 4 weeks)
weeks_to_months = {
1: 'Jan', 5: 'Feb', 9: 'Mar', 13: 'Apr', 17: 'May', 21: 'Jun',
25: 'Jul', 29: 'Aug', 33: 'Sep', 37: 'Oct', 41: 'Nov', 45: 'Dec'
}
# Plot 2: Histogram of 'week' where 'WnvPresent' is 1
fig, ax1 = plt.subplots(figsize=(10, 6))
# Main plot (weeks on the x-axis)
df[df['WnvPresent'] == 1]['week'].hist(bins=52, color='darkgreen', ax=ax1)
ax1.set_title('Distribution of WNV positive cases by Week', fontsize=16)
ax1.set_xlabel('Week of the Year', fontsize=12)
ax1.set_ylabel('Frequency', fontsize=12)
ax1.set_xticks(range(1, 53, 4)) # Tick every 4 weeks on the x-axis
ax1.grid(True)
# Create a secondary x-axis to display month names
ax2 = ax1.twiny() # Create a twin Axes that shares the same y-axis
ax2.set_xlim(ax1.get_xlim()) # Match the limits of the x-axis
ax2.set_xticks(list(weeks_to_months.keys())) # Set the positions for the months
ax2.set_xticklabels(list(weeks_to_months.values())) # Set the month labels
ax2.set_xlabel('Month', fontsize=12)
plt.show()
# Plot 4
# Calculate the total number of samples for each TRAP_TYPE and Species
unique_traps_per_type = df.groupby('TRAP_TYPE')['Trap'].nunique()
total_mosquitoes = df.groupby(['TRAP_TYPE', 'Species'])['NumMosquitos'].sum().unstack(fill_value=0)
normalized_mosquitoes = total_mosquitoes.div(unique_traps_per_type, axis=0).fillna(0)
# Plot the data as parallel bars
ax = normalized_mosquitoes.plot(kind='bar', figsize=(12, 6), colormap='tab20', width=0.8)
# Set labels and title
plt.xlabel('Trap Type', fontsize=12)
plt.ylabel('Average Number of Mosquitoes per Trap', fontsize=12)
plt.title('Effectiveness of Trap Types in Catching Mosquitoes by Species (Normalized by Number of Traps)', fontsize=16)
# Adjust the legend position
plt.legend(title='Species', loc='upper right', bbox_to_anchor=(0.75, 1))
# Rotate x-axis labels for better readability
plt.xticks(rotation=0, ha='right')
# Add grid lines for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Adjust layout to prevent clipping of tick-labels
plt.tight_layout()
# Show the plot
plt.show()
# Plot 5 - Adjusted for Fair Comparison
# Calculate the total number of samples for each TRAP_TYPE and Species
unique_traps_per_type = df.groupby('TRAP_TYPE')['Trap'].nunique()
total_mosquitoes = df.groupby(['TRAP_TYPE', 'Species'])['WnvPresent'].sum().unstack(fill_value=0)
normalized_mosquitoes = total_mosquitoes.div(unique_traps_per_type, axis=0).fillna(0)
# Plot the data as parallel bars
ax = normalized_mosquitoes.plot(kind='bar', figsize=(12, 6), colormap='tab20', width=0.8)
# Set labels and title
plt.xlabel('Trap Type', fontsize=12)
plt.ylabel('Proportion of WnvPresent Cases (Normalized)', fontsize=12)
plt.title('Effectiveness of Trap Types in Catching Species When WnvPresent (Normalized by Number of Traps)', fontsize=16)
# Place legend inside the figure
plt.legend(title='Species', loc='upper right', bbox_to_anchor=(.75, 1))
# Rotate x-axis labels for better readability
plt.xticks(rotation=0, ha='right')
# Add grid lines for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Adjust layout to prevent clipping of tick-labels
plt.tight_layout()
# Show the plot
plt.show()
###########
# Plot 6
# Grouping by 'Trap' and aggregating 'WnvPresent' and 'NumMosquitos'
trap_grouped = df.groupby('Trap').agg({
'WnvPresent': 'sum', # Total count of WnvPresent
'NumMosquitos': 'sum' # Total number of mosquitoes
}).reset_index()
# Merging trap_grouped with additional information (Location and Trap type) from the original dataframe
top_traps_info = df[['Trap', 'Address', 'TRAP_TYPE']].drop_duplicates()
trap_grouped = pd.merge(trap_grouped, top_traps_info, on='Trap', how='left')
# Sorting by 'WnvPresent' and selecting the top 10 traps
top_traps = trap_grouped.sort_values(by='WnvPresent', ascending=False).head(10)
# Creating a table with Trap ID, Location, and Trap type
top_traps_table = top_traps[['Trap', 'Address', 'TRAP_TYPE']].reset_index()
print(top_traps_table)
# Plotting
fig, ax1 = plt.subplots(figsize=(10, 6))
# Bar width and positions
bar_width = 0.4
index = range(len(top_traps))
# Plotting the 'WnvPresent' counts on the left y-axis
ax1.bar(index, top_traps['WnvPresent'], width=bar_width, color='darkblue', label='WnvPresent Count')
ax1.set_xlabel('Trap #', fontsize=12)
ax1.set_ylabel('WnvPresent Count', fontsize=12, color='darkblue')
ax1.tick_params(axis='y', labelcolor='darkblue')
# Setting x-ticks and labels
ax1.set_xticks([i for i in index])
ax1.set_xticklabels(top_traps['Trap'], rotation=45, ha='right')
# Creating the secondary y-axis for 'NumMosquitos'
ax2 = ax1.twinx()
ax2.bar([i + bar_width for i in index], top_traps['NumMosquitos'], width=bar_width, color='darkgreen', label='NumMosquitos Count')
ax2.set_ylabel('NumMosquitos Count', fontsize=12, color='darkgreen')
ax2.tick_params(axis='y', labelcolor='darkgreen')
# Adding a title
plt.title('Top 10 Traps sorted by WNV positive cases and #Mosquitos', fontsize=16)
# Adjust layout for readability
plt.tight_layout()
plt.show()
top_traps_table.to_excel('effective_traps.xlsx', index=False)
# Plot 7: Create a figure for the histogram
# Group by 'COMMUNITY AREA NAME' to count the number of WNV-positive cases per community area
wnv_by_community = df[df['WnvPresent'] == 1].groupby('COMMUNITY AREA NAME').size().sort_values(ascending=False)
# Plot the data
plt.figure(figsize=(12, 8))
wnv_by_community.plot(kind='bar')
# Set title and labels
plt.title('WNV-Positive Cases by Community Area', fontsize=14)
plt.xlabel('Community Area', fontsize=12)
plt.ylabel('Number of WNV-Positive Cases', fontsize=12)
# Rotate the x-axis labels for readability
plt.xticks(rotation=90)
# Show the plot
plt.tight_layout()
plt.grid()
plt.show()
# Plot 8: Heatmaps
import folium
from folium.plugins import HeatMap
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull
from scipy.stats import norm
# Step 1: Create a Heatmap with Folium
m = folium.Map(location=[df['Latitude'].mean(), df['Longitude'].mean()], zoom_start=12)
# Prepare heat data
try:
heat_data = [[row['Latitude'], row['Longitude'], row['WnvPresent']] for index, row in df.iterrows()]
HeatMap(heat_data).add_to(m)
except KeyError as e:
print(f"KeyError: {e}. Ensure your DataFrame contains the required columns.")
# Save the map to an HTML file
m.save("heatmap.html")
# Step 2: Visualizing Isoperimetric Regions with Matplotlib
points = df[['Longitude', 'Latitude']].values
hull = ConvexHull(points)
plt.figure(figsize=(10, 6))
# Use a more distinctive way to represent 'WnvPresent' by modifying the marker style and scaling the sizes
sizes = np.where(df['WnvPresent'] > 0, df['WnvPresent'] * 50, 10) # Larger sizes for WnvPresent > 0, default size otherwise
colors = np.where(df['WnvPresent'] > 0, 'red', 'blue') # Different color for WnvPresent
plt.scatter(df['Longitude'], df['Latitude'], c=colors, s=sizes, alpha=0.7, edgecolor='k')
for simplex in hull.simplices:
plt.plot(points[simplex, 0], points[simplex, 1], 'r-', lw=2)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Heatmap with Isoperimetric Regions')
plt.grid()
plt.show()
# Plot 9. Visualize outbreaks
df.index = pd.to_datetime(df.index, dayfirst=True)
# We consider an outbreak as a period with 3 or more consecutive weeks of WnvPresent >= 1
weekly_wnv = df['WnvPresent'].resample('W').sum()
outbreak_threshold = 1 # Minimum number of WnvPresent occurrences for a week to be considered
consecutive_weeks = 3 # Minimum number of consecutive weeks to classify as an outbreak
# Create a mask where WnvPresent >= outbreak_threshold
outbreak_mask = weekly_wnv >= outbreak_threshold
# Find the start and end indices of outbreaks
outbreaks = []
start_idx = None
for i in range(len(outbreak_mask)):
if outbreak_mask.iloc[i] and start_idx is None:
start_idx = i
elif not outbreak_mask.iloc[i] and start_idx is not None:
if i - start_idx >= consecutive_weeks:
outbreaks.append((start_idx, i - 1))
start_idx = None
# If the outbreak goes till the end of the data
if start_idx is not None and len(outbreak_mask) - start_idx >= consecutive_weeks:
outbreaks.append((start_idx, len(outbreak_mask) - 1))
# Step 4: Plot the data with outbreaks highlighted
plt.figure(figsize=(12, 6))
plt.plot(weekly_wnv.index, weekly_wnv.values, color='blue', label='Weekly WnvPresent Count')
# Highlight outbreaks
for start, end in outbreaks:
plt.axvspan(weekly_wnv.index[start], weekly_wnv.index[end], color='red', alpha=0.3, label='Outbreak' if start == outbreaks[0][0] else "")
# Set x-axis ticks manually for each year in the data range
years = pd.date_range(start=weekly_wnv.index.min(), end=weekly_wnv.index.max(), freq='YS')
plt.xticks(years, [year.year for year in years], rotation=90)
# Add labels, title, and legend
plt.xlabel('Date', fontsize=12)
plt.ylabel('WnvPresent Count', fontsize=12)
plt.title('WNV Outbreaks Over Time', fontsize=16)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
# Plot 10: Histogram of 'NumMosquitos' where 'WnvPresent' is 1
# Use default style (white background)
plt.style.use('default')
plt.figure(figsize=(10, 6))
df[df['WnvPresent'] == 1]['NumMosquitos'].hist(bins=20, color='darkblue')
plt.title('Distribution of Mosquitos in batches (when positive for WNV)', fontsize=16)
plt.xlabel('Number of Mosquitos', fontsize=12)
plt.ylabel('Counts', fontsize=12)
plt.legend(['NumMosquitos'])
plt.grid(True)
plt.show()
# Plot 11
#Calculate number of days since Aug 1 using Pandas' vectorized operations
aug1 = pd.to_datetime(df['year'].astype(str) + '-08-01')
df['DaysSinceAug1'] = (pd.to_datetime(df.index, dayfirst=True) - aug1).dt.days # Correct the subtraction by converting index to datetime
# Calculate number of days since Aug 1 using Pandas' vectorized operations
# Count occurrences of WnvPresent
days_present_counts = df.groupby('DaysSinceAug1')['WnvPresent'].sum()
# Create a new DataFrame to hold the counts
days_df = pd.DataFrame({'DaysSinceAug1': days_present_counts.index, 'WnvPresent': days_present_counts.values})
# Fit a Gaussian distribution to the WnvPresent data
mean = 15
std_dev = days_df['DaysSinceAug1'].std()
# Create an array of x values for the Gaussian curve
x = np.linspace(days_df['DaysSinceAug1'].min(), days_df['DaysSinceAug1'].max(), 100)
# Calculate the Gaussian PDF values
gaussian_pdf = norm.pdf(x, mean, std_dev) * (days_df['WnvPresent'].max() * 0.05) # Scale for visibility
# Plotting
plt.figure(figsize=(10, 6))
plt.bar(days_df['DaysSinceAug1'], days_df['WnvPresent'], color='darkblue', edgecolor='black', alpha=0.7, label='Wnv Present Count')
plt.plot(x, 800*gaussian_pdf, color='red', linewidth=2, label='Fitted Gaussian PDF')
# Add labels and title
plt.xlabel('Days Since August 1', fontsize=12)
plt.ylabel('Count of Wnv Present', fontsize=12)
plt.title('Distribution of Wnv Present by Days Since August 1 with Gaussian Fit (2007-2024)', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Add legend
plt.legend()
plt.show()
# Plot 12 in bivariate.py
# Table 3
# Approximating the number of mosquitos per trap based on multirow count:
# Group by 'Species', 'Trap', 'Address', 'Date'
grouped = df.groupby(['Species', 'Trap', 'Address', 'Date'])
# Get the count of each unique combination of 'Species', 'Trap', 'Address', 'Date'
counts = grouped.size().reset_index(name='count')
# Merge the counts back into the original DataFrame
train_with_counts = pd.merge(df, counts, on=['Species', 'Trap', 'Address', 'Date'])
# Create a list of how many times each combination occurs for each c in range(30)
combination_counts = [len(train_with_counts[train_with_counts['count'] == c]) for c in range(1,30)]
# Calculate the mean 'NumMosquitos' for each unique combination that occurs c times
mean_num_mosquitos = [train_with_counts.loc[train_with_counts['count'] == c, 'NumMosquitos'].mean() for c in range(1,30)]
# Data for the table
data = {
'Multirow Count': list(range(1,14)),
'# cases': combination_counts[:13],
'Mean # Mosquitos': mean_num_mosquitos[:13]
}
# Creating the DataFrame
df_table = pd.DataFrame(data)
df_table['Mean # Mosquitos'] = df_table['Mean # Mosquitos'].round(1)
# Display the table
print(df_table.to_string(index=False))
# Figure 13 is created in the file classification_Chicago_WNV.py