-
Notifications
You must be signed in to change notification settings - Fork 3
/
analysisActivityDistribution.py
282 lines (238 loc) · 12.1 KB
/
analysisActivityDistribution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
#############################################################################################
# SCRIPT INFORMATION
#############################################################################################
# LICENSE INFORMATION:
# ---------------------
# analysisActivityDistribution.py
# Authors: Jérémy Bonvoisin, Jonas Massmann
# Homepage: http://opensourcedesign.cc
# License: GPL v.3
# PREREQUISITES:
# ---------------
# - a directory with GraphML files containing the description of commits as given by the GitHub API.
#   File names are formatted as follows: <projectname>.committers.<series>.graphml
# - non-standard libraries (install with `pip install <libraryName>`):
#   . NetworkX (https://networkx.github.io/documentation/stable/reference/index.html)
#   . matplotlib
#
# ARGUMENTS:
# -----------
# see help() function
#############################################################################################
# HEADER
#############################################################################################
# standard python libraries and third-party packages (matplotlib, networkx)
import os
import csv
from getopt import getopt, GetoptError
import matplotlib.pyplot as plt
import networkx as nx
from sys import stdout, exit, argv
#############################################################################################
# FUNCTION help
#############################################################################################
def help():
    """Print the command-line usage summary, then terminate the script.

    This function never returns: it ends with ``exit()`` (imported from
    ``sys``), which raises ``SystemExit``. Note that, inside this module,
    the name shadows the builtin ``help``.
    """
    usageLines = (
        'Required Arguments:',
        '-i --input <path> path of the directory where the GraphML files are stored',
        '-o --output <path> path of the directory where the figures should be stored',
        'Optional Arguments:',
        '-w --weights use edge weights for clustering coefficients',
        '-c --clearscreen clear screen before processing',
        '-h --help calls help function',
    )
    for usageLine in usageLines:
        print(usageLine)
    exit()
#############################################################################################
# FUNCTION loadGraphMLs
#############################################################################################
def loadGraphMLs(inputDir, extension):
    """Read every GraphML file of a directory whose name ends with ``extension``.

    Parameters:
        inputDir:  path of the directory to scan (not recursive).
        extension: file-name suffix to select, e.g. 'committers.all.graphml'.
                   The match is case-insensitive.

    Returns:
        A list with one graph (as returned by ``nx.read_graphml``) per
        matching file, in the order delivered by ``os.listdir``.

    Side effects:
        Prints the number of matching files and draws a one-line progress
        bar on stdout while the files are being read.
    """
    suffix = extension.lower()
    graphMLFiles = [
        f for f in os.listdir(inputDir)
        if os.path.isfile(os.path.join(inputDir, f)) and f.lower().endswith(suffix)
    ]
    numberOfGraphMLFiles = len(graphMLFiles)
    print(str(numberOfGraphMLFiles) + ' files found ending with "' + extension + '"')

    graphs = []
    # count from 1 so that the bar shows the files already processed and
    # reaches 30 characters / 100% once the last file has been read
    for done, fileName in enumerate(graphMLFiles, start=1):
        graphs.append(nx.read_graphml(os.path.join(inputDir, fileName)))
        # progress bar: '\r' moves the cursor back so the bar is redrawn in place
        stdout.write('\r')
        stdout.write("[%-30s] %d%%" % ('=' * (done * 30 // numberOfGraphMLFiles),
                                       done * 100 // numberOfGraphMLFiles))
        stdout.flush()
    print('\r')
    return graphs
#############################################################################################
# FUNCTION computeIndicators
#############################################################################################
def computeIndicators(committerGraphs, repoReferences, useWeights=False):
    """Compute structural indicators for a series of committer graphs.

    Parameters:
        committerGraphs: iterable of NetworkX graphs, one per project.
        repoReferences:  iterable of project names, parallel to
                         ``committerGraphs``. Only its length matters here:
                         iteration stops at the shorter of the two inputs.
        useWeights:      if True, the clustering coefficients take the edge
                         attribute 'weight' into account.

    Returns:
        A tuple of six parallel lists (one entry per processed graph):
        (committersNumbers, completenessIndex, centralizationIndex,
         clusteringIndex, averagePLs, diameters).
        An indicator that cannot be computed for a graph is float('nan').
    """
    committersNumbers = []    # to host numbers of committers
    completenessIndex = []    # to host graph completeness values
    centralizationIndex = []  # to host the graph degree centralization values
    clusteringIndex = []      # to host the average clustering coefficients
    averagePLs = []           # to host average path lengths
    diameters = []            # to host diameters

    # zip() is kept on purpose: as before, a shorter repoReferences truncates the run
    for committerGraph, _repoReference in zip(committerGraphs, repoReferences):
        # the indicators are only available for undirected graphs
        graph = _toUndirectedGraph(committerGraph)

        numberOfCommitters = nx.number_of_nodes(graph)  # number of committers in this project
        committersNumbers.append(numberOfCommitters)

        completenessIndex.append(_completeness(graph, numberOfCommitters))
        centralizationIndex.append(_degreeCentralization(graph, numberOfCommitters))
        clusteringIndex.append(_averageClustering(graph, useWeights))
        averagePLs.append(_averagePathLength(graph))
        diameters.append(_diameter(graph))

    return committersNumbers, completenessIndex, centralizationIndex, clusteringIndex, averagePLs, diameters


def _toUndirectedGraph(graph):
    """Return ``graph`` itself if it is undirected, else an undirected nx.Graph copy."""
    if nx.is_directed(graph):
        # NOTE(review): the extra nx.Graph(...) presumably collapses parallel
        # edges when the input is a multigraph - confirm against the input data
        return nx.Graph(nx.to_undirected(graph))
    return graph


def _completeness(graph, numberOfCommitters):
    """Return the ratio of existing edges to the n*(n-1)/2 possible ones (nan if n <= 1)."""
    if numberOfCommitters <= 1:
        return float('nan')  # no completeness value can be calculated
    # position in the scale between:
    # - 0 edges (the graph is entirely disconnected)
    # - n*(n-1)/2 edges, where n is the number of nodes (each node is connected
    #   with all other nodes)
    maxNumberOfEdges = numberOfCommitters * (numberOfCommitters - 1) / 2
    return nx.number_of_edges(graph) / maxNumberOfEdges


def _degreeCentralization(graph, numberOfCommitters):
    """Return the degree centralization of ``graph`` (nan if the normalizer is 0 or the graph is empty).

    Centrality, after "Social Network Analysis: Methods and Applications",
    Stanley Wasserman, Katherine Faust
    --> Degree is the number of nodes that a focal node is connected to,
        and measures the involvement of the node in the network
    https://books.google.de/books?id=CAm2DpIqRUIC&printsec=frontcover&redir_esc=y#v=onepage&q=Centrality&f=false
    https://cs.brynmawr.edu/Courses/cs380/spring2013/section02/slides/05_Centrality.pdf
    """
    # get degree of every node
    degrees = [nodeDegree for _node, nodeDegree in nx.degree(graph)]
    # the original script attributes this formula to Freeman (1977):
    # sum of the differences to the maximum degree, divided by (n-1)*(n-2)
    normalizer = (numberOfCommitters - 1) * (numberOfCommitters - 2)
    if normalizer == 0 or not degrees:
        return float('nan')  # no centralization value can be calculated
    maximumDegree = max(degrees)
    return sum(maximumDegree - nodeDegree for nodeDegree in degrees) / normalizer


def _averageClustering(graph, useWeights):
    """Return the mean of the per-node clustering coefficients (nan for an empty graph).

    After "Generalizations of the clustering coefficient to weighted complex networks",
    J. Saramäki, M. Kivelä, J.-P. Onnela, K. Kaski, and J. Kertész,
    Physical Review E, 75 027105 (2007).
    http://jponnela.com/web_documents/a9.pdf
    Corresponding NetworkX function:
    https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.cluster.clustering.html?highlight=clustering#networkx.algorithms.cluster.clustering
    Note: self loops are ignored by the nx.clustering function!
    """
    if useWeights:
        clusteringCoefficients = nx.clustering(graph, weight='weight')
    else:
        clusteringCoefficients = nx.clustering(graph)
    if not clusteringCoefficients:
        return float('nan')  # no clustering value can be calculated
    # average over all clustering coefficients
    return sum(clusteringCoefficients.values()) / len(clusteringCoefficients)


def _averagePathLength(graph):
    """Return the average shortest path length, or nan when NetworkX refuses to compute it."""
    try:
        return nx.average_shortest_path_length(graph)
    except (nx.exception.NetworkXError, nx.exception.NetworkXPointlessConcept):
        return float('nan')


def _diameter(graph):
    """Return the diameter of ``graph``, or nan when NetworkX cannot compute it."""
    try:
        return nx.diameter(graph)
    except (nx.exception.NetworkXError, ValueError):
        return float('nan')
###################################################################################################################
# BODY
###################################################################################################################
#initialisation
#####################
# inputDir and outputDir are plain module-level variables; a "global" statement
# has no effect at module level and is therefore not needed here

# get command line arguments
# ('weights' must be listed among the long options, otherwise the documented
#  --weights switch is rejected by getopt as an unknown option)
try:
    options, remainder = getopt(argv[1:], 'i:o:chw', ['input=', 'output=', 'weights', 'clearscreen', 'help'])
except GetoptError as err:
    print(str(err))
    exit(2)

# initialise the parameters to be found in the arguments
inputDir = ''
outputDir = ''
clearscreen = False
useWeights = False

# search the parameters in the arguments given to the script
for option, argument in options:
    if option in ('-i', '--input'):
        inputDir = argument
    elif option in ('-o', '--output'):
        outputDir = os.path.relpath(argument)
    elif option in ('-c', '--clearscreen'):
        clearscreen = True
    elif option in ('-w', '--weights'):
        useWeights = True
        print('*use edge weights for calculation of clustering coefficients*')
    elif option in ("-h", "--help"):
        help()  # prints the usage and terminates the script

# check whether all required parameters have been given as arguments and if not, report it and abort
if inputDir == '':
    print("Argument required: input directory. Type '-i <directory path>' in the command line")
    exit(2)
if outputDir == '':
    print("Argument required: output directory. Type '-o <directory path>' in the command line")
    exit(2)

# check whether the output folder exists and create it if not
if not os.path.exists(outputDir):
    os.makedirs(outputDir)

# execute options chosen by the user
if clearscreen:
    # 'cls' only exists on Windows; other platforms use 'clear'
    os.system('cls' if os.name == 'nt' else 'clear')
# define the series of files to look at and analyze
seriesNames = ['committers.all.graphml', 'committers.phw.graphml', 'committers.chw.graphml']

# project names are derived from the files of the first series by stripping the
# three trailing name components ('.graphml', '.<series>', '.committers')
repoReferences = [
    os.path.splitext(os.path.splitext(os.path.splitext(f)[0])[0])[0]
    for f in os.listdir(inputDir)
    if os.path.isfile(os.path.join(inputDir, f)) and f.lower().endswith(seriesNames[0].lower())]

for serie in seriesNames:
    columnsToExport = [repoReferences]

    # load required graphs
    # NOTE(review): graphs are matched to repoReferences by position only, which
    # assumes os.listdir delivers the projects in the same order for every
    # series - confirm, or match the files by project name instead
    loadedGraph = loadGraphMLs(inputDir, serie)
    if len(loadedGraph) != len(repoReferences):
        print ("error, all file series should have the same number of elements")
        # only 'exit' is imported from sys; 'sys.exit' would raise a NameError
        exit(2)

    nrCommitters, completeness, centralization_index, network_clustering, averagePLs, diameters = \
        computeIndicators(loadedGraph, repoReferences, useWeights)
    columnsToExport.extend([
        nrCommitters,
        completeness,
        centralization_index,
        network_clustering,
        averagePLs,
        diameters,
    ])

    # transpose the list of lists (columns -> rows) and export it as CSV file
    #####################
    CSVdata = map(list, zip(*columnsToExport))
    with open(os.path.join(outputDir, 'graphAnalysis'+serie+'.csv'), 'w', newline='') as csvOutput:
        CSVWriter = csv.writer(csvOutput, delimiter=";")
        # no header row is written; one row per project
        CSVWriter.writerows(CSVdata)

    ## Box PLOT number of committers ############################################################
    #############################################################################################
    # box plot (filled boxes, no notches)
    plt.figure(figsize=(2, 7))
    bplot = plt.boxplot(nrCommitters, vert=True, patch_artist=True)
    for patch in bplot['boxes']:
        patch.set_facecolor('#b4b4b4')
    for median in bplot['medians']:
        median.set(color='k')
    ax = plt.gca()
    ax.set_ylabel('Number of committers')
    ax.get_xaxis().set_ticks([])
    ax.set_ylim([0, 40])  # fixed scale; larger values fall outside the visible range
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.savefig(os.path.join(outputDir, 'committers_per_project.'+serie+'.boxplot.svg'), bbox_inches="tight")
    # release the figure so that figures do not pile up across the series
    plt.close()