From b67baaaec4b9445dafbdf93e829352124611a31a Mon Sep 17 00:00:00 2001
From: Jakub Žitný
Date: Tue, 19 Jul 2016 11:10:22 +0200
Subject: [PATCH] fix traversal in file-level tokenizer input

fix #8
---
 tokenizers/file-level/src/tokenizer-directory.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tokenizers/file-level/src/tokenizer-directory.py b/tokenizers/file-level/src/tokenizer-directory.py
index 06a901df5..5d437cb26 100644
--- a/tokenizers/file-level/src/tokenizer-directory.py
+++ b/tokenizers/file-level/src/tokenizer-directory.py
@@ ... @@
 import logging
 from multiprocessing import Process, Value, Lock
 import re
 import os
 import sys
 import collections
 from lockfile import LockFile
 import hashlib
 try:
     from configparser import ConfigParser
 except ImportError:
     from ConfigParser import ConfigParser # ver. < 3.0
 
 # Logging code
 FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s'
 logging.basicConfig(level=logging.DEBUG,format=FORMAT)
 file_handler = logging.FileHandler('results.log')
 file_handler.setFormatter(logging.Formatter(FORMAT))
 logging.getLogger().addHandler(file_handler)
 
 config_file = 'config.ini'
 
 # instantiate
 config = ConfigParser()
 
 # parse existing file
 try:
     config.read(config_file)
 except IOError:
     print 'Config settings not found'
     logging.error('Config file ['+config_file+'] not found')
     sys.exit()
 
 # Provided by the user
 N_PROCESSES = config.getint('Main', 'N_PROCESSES')
 language_file = config.get('Main', 'language_file')
 
 # Folders
 PATH_proj_paths = config.get('Folders/Files', 'PATH_proj_paths')
 PATH_tokens_folder = config.get('Folders/Files', 'PATH_tokens_folder')
 PATH_bookkeeping_file_folder = config.get('Folders/Files', 'PATH_bookkeeping_file_folder')
 PATH_bookkeeping_proj_folder = config.get('Folders/Files', 'PATH_bookkeeping_proj_folder')
 PATH_projects_success = config.get('Folders/Files', 'PATH_projects_success')
 PATH_project_starting_index = config.get('Folders/Files', 'PATH_project_starting_index')
 PATH_projects_fail = config.get('Folders/Files', 'PATH_projects_fail')
 
 try:
     config.read(language_file)
 except IOError:
     print 'Language settings not found'
     logging.error('Language settings ['+language_file+'] not found')
     sys.exit()
 
 # Read language settings
 separators = config.get('Language', 'separators').split(' ')
 file_extensions = config.get('Language', 'file_extensions').split(' ')
 comment_end_of_line = config.get('Language', 'comment_inline')
 comment_open_tag = re.escape(config.get('Language', 'comment_open_tag'))
 comment_close_tag = re.escape(config.get('Language', 'comment_close_tag'))
 
 def tokenizer(proj_id, proj_path, file_starting_id, FILE_tokens, FILE_bookkeeping_file, FILE_bookkeeping_proj):
     if not os.path.exists(proj_path):
         logging.error('Project not found <'+proj_id+','+proj_path+'>')
         # Important to have a global lock on this file because it is shared
         lock = LockFile(PATH_projects_fail)
         with lock:
             with open(PATH_projects_fail,'a+') as project_fail:
                 project_fail.write(proj_path+'\n')
         return
 
     # In case process names need to be logged
     # process_name = '['+mp.current_process().name+'] '
     # logging.info(process_name+'Starting proj <'+proj_id+','+proj_path+'>')
 
     all_files = []
     for (dirpath, dirnames, filenames) in os.walk(proj_path):
         aux_list = []
         for extension in file_extensions:
             aux = [x for x in filenames if x.endswith(extension)]
-            #aux = [os.path.join(dirpath,x) for x in aux]
+            aux = [os.path.join(dirpath,x) for x in aux]
             aux_list.extend(aux)
         all_files.extend(aux_list)
 
     # Increment the shared file_starting_id by the number of files in the current project
     lock = Lock()
     with lock:
         all_files = zip(range(file_starting_id.value, file_starting_id.value+len(all_files)), all_files)
         file_starting_id.value += len(all_files)
 
     for file_id, file_path in all_files:
-        file_path = proj_path+'/'+file_path
         logging.info('Starting file <'+proj_id+','+str(file_id)+','+file_path+'>')
         try:
             with open(file_path,'r') as myfile:
                 file_string = myfile.read()
         except IOError:
             logging.error('File not found <'+proj_id+','+str(file_id)+','+file_path+'>')
             continue
 
         # Remove end-of-line comments
         file_string = re.sub(comment_end_of_line+'.*?\n','',file_string,flags=re.DOTALL)
         # Remove tagged comments
         file_string = re.sub(comment_open_tag+'.*?'+comment_close_tag,'',file_string,flags=re.DOTALL)
 
         # Transform separators into spaces (remove them)
         for x in separators:
             file_string = file_string.replace(x,' ')
 
         # Create a list of tokens
         file_string = file_string.split()
 
         # Total number of tokens
         tokens_count_total = len(file_string)
 
         # Count occurrences
         file_string = collections.Counter(file_string)
 
         # Convert the Counter to a plain dict
         file_string = dict(file_string)
 
         # Unique number of tokens
         tokens_count_unique = len(file_string)
 
         tokens = []
         # SourcererCC formatting
         for k, v in file_string.items():
             tokens.append(k+'@@::@@'+str(v))
         tokens = ','.join(tokens)
 
         # MD5
         m = hashlib.md5()
         m.update(tokens)
 
         FILE_bookkeeping_file.write(proj_id+','+str(file_id)+','+file_path+'\n')
         FILE_tokens.write(proj_id+','+str(file_id)+','+str(tokens_count_total)+','+str(tokens_count_unique)\
                           +','+m.hexdigest()\
                           +'@#@'+tokens+'\n')
 
     FILE_bookkeeping_proj.write(proj_id+','+proj_path+'\n')
 
     # Important to have a global lock on this file because it is shared
     lock = LockFile(PATH_projects_success)
     with lock:
         with open(PATH_projects_success,'a+') as project_success:
             project_success.write(proj_path+'\n')
 
 def tokenize(list_projects, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name, file_starting_id):
     # Each tokenize call represents a new process
     with open(FILE_tokens_name, 'w') as FILE_tokens, \
          open(FILE_bookkeeping_file_name, 'w') as FILE_bookkeeping_file, \
          open(FILE_bookkeeping_proj_name, 'w') as FILE_bookkeeping_proj:
         for proj_id, proj_path in list_projects:
             tokenizer(str(proj_id), proj_path, file_starting_id, FILE_tokens, FILE_bookkeeping_file, FILE_bookkeeping_proj)
 
 if __name__ == '__main__':
     # In the main block we:
     #   create directories if they do not exist
     #   read the list in PATH_projects_success, if it exists, and do not process those projects again
     #   give each process a unique file for tokens and for file and project
     #     bookkeeping in the proper folders
     #   start N_PROCESSES, and give them [(unique_id, proj_path)]
     if not os.path.exists(PATH_tokens_folder):
         os.makedirs(PATH_tokens_folder)
     if not os.path.exists(PATH_bookkeeping_file_folder):
         os.makedirs(PATH_bookkeeping_file_folder)
     if not os.path.exists(PATH_bookkeeping_proj_folder):
         os.makedirs(PATH_bookkeeping_proj_folder)
 
     proj_paths = []
     with open(PATH_proj_paths) as f:
         for line in f:
             proj_paths.append(line.strip('\n'))
 
     projects_success = []
     try:
         with open(PATH_projects_success,'r') as f:
             for line in f:
                 projects_success.append(line.strip().strip('\n'))
     except IOError as e:
         logging.info('File '+PATH_projects_success+' not found')
 
     projects_starting_index = 0
     proj_paths = list(set(proj_paths) - set(projects_success))
 
     # Initialize projects_starting_index with the previously logged number
     if not os.path.exists(PATH_project_starting_index):
         with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
             FILE_project_starting_index.write(str(len(proj_paths))+'\n')
     else:
         try:
             with open(PATH_project_starting_index, 'r') as FILE_project_starting_index:
                 projects_starting_index = int(FILE_project_starting_index.readline().strip('\n'))
         except ValueError:
             projects_starting_index = 0
         with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
             FILE_project_starting_index.write(str(projects_starting_index+len(proj_paths))+'\n')
 
     proj_paths = zip(range(projects_starting_index, len(proj_paths)+projects_starting_index),proj_paths)
 
     # Split the list of projects into N_PROCESSES lists
     proj_paths_list = [ proj_paths[i::N_PROCESSES] for i in xrange(N_PROCESSES) ]
 
     # Multiprocessing with N_PROCESSES
     processes = []
     # Multiprocessing shared variable instance for recording file_id
     file_starting_id = Value('i', 0)
     process_num = 0
     n = 0
     for input_process in proj_paths_list:
         # Skip empty sublists
         if len(input_process) == 0:
             continue
         process_num += 1
         FILE_tokens_name = PATH_tokens_folder+'/'+'tokens_'+str(n)+'.txt'
         FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/'+'bookkeeping_file_'+str(n)+'.txt'
         FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/'+'bookkeeping_proj_'+str(n)+'.txt'
         while (os.path.isfile(FILE_tokens_name) and os.path.isfile(FILE_bookkeeping_file_name) and os.path.isfile(FILE_bookkeeping_proj_name)):
             n += 1
             FILE_tokens_name = PATH_tokens_folder+'/'+'tokens_'+str(n)+'.txt'
             FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/'+'bookkeeping_file_'+str(n)+'.txt'
             FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/'+'bookkeeping_proj_'+str(n)+'.txt'
         n += 1
         processes.append(Process(name='Process '+str(process_num), target=tokenize,
                                  args=(input_process, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name, file_starting_id, )))
 
     for proc in processes:
         proc.start()
         logging.info(proc.name)
 
     for proc in processes:
         proc.join()
\ No newline at end of file
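--
Notes (commentary, not part of the patch):

The bug: os.walk() yields each filename relative to the dirpath it was found
in, but the old code kept only the bare filename and later rebuilt the path
as proj_path+'/'+file_path. That works for files sitting directly in the
project root and silently breaks for anything nested: the rebuilt path does
not exist, the open() raises IOError, and the file is logged as 'File not
found' and skipped. Joining with dirpath inside the walk, and dropping the
later concatenation, yields a correct path at every depth. A minimal sketch
of the difference, with a made-up project layout:

    import os

    proj_path = '/tmp/demo-proj'
    # hypothetical layout:
    #   /tmp/demo-proj/Top.java
    #   /tmp/demo-proj/src/Nested.java
    for dirpath, dirnames, filenames in os.walk(proj_path):
        for name in filenames:
            old_path = proj_path + '/' + name       # for Nested.java: '/tmp/demo-proj/Nested.java' (wrong)
            new_path = os.path.join(dirpath, name)  # for Nested.java: '/tmp/demo-proj/src/Nested.java'
            print(new_path + ' exists: ' + str(os.path.isfile(new_path)))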
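One caveat the patch does not touch: the block that advances file_starting_id
wraps the update in a freshly constructed Lock(), and a lock created inside
one worker is invisible to its siblings, so it provides no cross-process
exclusion. The synchronized Value already carries a shared lock of its own.
A sketch of the usual reservation pattern (an assumption about the intent,
not what the file does today):

    from multiprocessing import Value

    file_starting_id = Value('i', 0)

    def reserve_ids(counter, count):
        # get_lock() returns the lock shared by every process holding the
        # Value, so concurrent reservations cannot hand out overlapping ranges.
        with counter.get_lock():
            start = counter.value
            counter.value += count
        return range(start, start + count)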
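For readers of the output, each record written to the tokens file has the
shape proj_id,file_id,total_tokens,unique_tokens,md5@#@token@@::@@count,...
A sketch that rebuilds one such line from the write calls above (the ids and
source text are made up, and the order of the token pairs is arbitrary
because it comes from a dict):

    import collections
    import hashlib

    words = 'static int x = x + 1'.split()
    counts = dict(collections.Counter(words))
    tokens = ','.join(k + '@@::@@' + str(v) for k, v in counts.items())
    m = hashlib.md5()
    m.update(tokens.encode())  # the Python 2 original hashes the str directly
    print('1,42,%d,%d,%s@#@%s' % (len(words), len(counts), m.hexdigest(), tokens))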
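Finally, the main block distributes work by striding rather than chunking:
proj_paths[i::N_PROCESSES] takes every N-th (id, path) pair starting at
offset i, so the sublists stay within one element of each other in length.
For example:

    proj_paths = [(0, 'p0'), (1, 'p1'), (2, 'p2'), (3, 'p3'), (4, 'p4')]
    N_PROCESSES = 2
    sublists = [proj_paths[i::N_PROCESSES] for i in range(N_PROCESSES)]
    # sublists[0] == [(0, 'p0'), (2, 'p2'), (4, 'p4')]
    # sublists[1] == [(1, 'p1'), (3, 'p3')]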