From b67baaaec4b9445dafbdf93e829352124611a31a Mon Sep 17 00:00:00 2001
From: Jakub Žitný
Date: Tue, 19 Jul 2016 11:10:22 +0200
Subject: [PATCH] fix traversal in file-level tokenizer input

fix #8
---
 tokenizers/file-level/src/tokenizer-directory.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tokenizers/file-level/src/tokenizer-directory.py b/tokenizers/file-level/src/tokenizer-directory.py
index 06a901df5..5d437cb26 100644
--- a/tokenizers/file-level/src/tokenizer-directory.py
+++ b/tokenizers/file-level/src/tokenizer-directory.py
@@ ... @@
 import logging
 from multiprocessing import Process, Value, Lock
 import re
 import os
 import sys
 import collections
 from lockfile import LockFile
 import hashlib
 try:
     from configparser import ConfigParser
 except ImportError:
     from ConfigParser import ConfigParser # ver. < 3.0
 
 # Logging code
 FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s'
 logging.basicConfig(level=logging.DEBUG,format=FORMAT)
 file_handler = logging.FileHandler('results.log')
 file_handler.setFormatter(logging.Formatter(FORMAT))
 logging.getLogger().addHandler(file_handler)
 
 config_file = 'config.ini'
 
 # instantiate
 config = ConfigParser()
 
 # parse existing file
 try:
     config.read(config_file)
 except IOError:
     print 'Config settings not found'
     logging.error('Config file ['+config_file+'] not found')
     sys.exit()
 
 # Provided by the user
 N_PROCESSES = config.getint('Main', 'N_PROCESSES')
 language_file = config.get('Main', 'language_file')
 
 # Folders
 PATH_proj_paths = config.get('Folders/Files', 'PATH_proj_paths')
 PATH_tokens_folder = config.get('Folders/Files', 'PATH_tokens_folder')
 PATH_bookkeeping_file_folder = config.get('Folders/Files', 'PATH_bookkeeping_file_folder')
 PATH_bookkeeping_proj_folder = config.get('Folders/Files', 'PATH_bookkeeping_proj_folder')
 PATH_projects_success = config.get('Folders/Files', 'PATH_projects_success')
 PATH_project_starting_index = config.get('Folders/Files', 'PATH_project_starting_index')
 PATH_projects_fail = config.get('Folders/Files', 'PATH_projects_fail')
 
 try:
     config.read(language_file)
 except IOError:
     print 'Language settings not found'
     logging.error('Language settings ['+language_file+'] not found')
     sys.exit()
 
 # Read language settings
 separators = config.get('Language', 'separators').split(' ')
 file_extensions = config.get('Language', 'file_extensions').split(' ')
 comment_end_of_line = config.get('Language', 'comment_inline')
 comment_open_tag = re.escape(config.get('Language', 'comment_open_tag'))
 comment_close_tag = re.escape(config.get('Language', 'comment_close_tag'))
 
 def tokenizer(proj_id, proj_path, file_starting_id, FILE_tokens, FILE_bookkeeping_file, FILE_bookkeeping_proj):
     if not os.path.exists(proj_path):
         logging.error('Project not found <'+proj_id+','+proj_path+'>')
         # Important to have a global lock on this file because it is shared
         lock = LockFile(PATH_projects_fail)
         with lock:
             with open(PATH_projects_fail,'a+') as project_fail:
                 project_fail.write(proj_path+'\n')
         return
 
     # In case process names need to be logged
     # process_name = '['+mp.current_process().name+'] '
     # logging.info(process_name+'Starting proj <'+proj_id+','+proj_path+'>')
 
     all_files = []
     for (dirpath, dirnames, filenames) in os.walk(proj_path):
         aux_list = []
         for extension in file_extensions:
             aux = [x for x in filenames if x.endswith(extension)]
-            #aux = [os.path.join(dirpath,x) for x in aux]
+            aux = [os.path.join(dirpath,x) for x in aux]
             aux_list.extend(aux)
         all_files.extend(aux_list)
 
     # Increment the shared file_starting_id by the number of files in the current project
     lock = Lock()
     with lock:
         all_files = zip(range(file_starting_id.value, file_starting_id.value+len(all_files)), all_files)
         file_starting_id.value += len(all_files)
 
     for file_id, file_path in all_files:
-        file_path = proj_path+'/'+file_path
         logging.info('Starting file <'+proj_id+','+str(file_id)+','+file_path+'>')
         try:
             with open(file_path,'r') as myfile:
                 file_string = myfile.read()
         except IOError:
             logging.error('File not found <'+proj_id+','+str(file_id)+','+file_path+'>')
             continue
 
         # Remove end-of-line comments
         file_string = re.sub(comment_end_of_line+'.*?\n','',file_string,flags=re.DOTALL)
         # Remove tagged comments
         file_string = re.sub(comment_open_tag+'.*?'+comment_close_tag,'',file_string,flags=re.DOTALL)
 
         # Transform separators into spaces (remove them)
         for x in separators:
             file_string = file_string.replace(x,' ')
 
         # Create a list of tokens
         file_string = file_string.split()
 
         # Total number of tokens
         tokens_count_total = len(file_string)
 
         # Count occurrences
         file_string = collections.Counter(file_string)
 
         # Convert the Counter to a plain dict
         file_string = dict(file_string)
 
         # Unique number of tokens
         tokens_count_unique = len(file_string)
 
         tokens = []
         # SourcererCC formatting
         for k, v in file_string.items():
             tokens.append(k+'@@::@@'+str(v))
         tokens = ','.join(tokens)
 
         # MD5
         m = hashlib.md5()
         m.update(tokens)
 
         FILE_bookkeeping_file.write(proj_id+','+str(file_id)+','+file_path+'\n')
         FILE_tokens.write(proj_id+','+str(file_id)+','+str(tokens_count_total)+','+str(tokens_count_unique)\
                           +','+m.hexdigest()\
                           +'@#@'+tokens+'\n')
 
     FILE_bookkeeping_proj.write(proj_id+','+proj_path+'\n')
 
     # Important to have a global lock on this file because it is shared
     lock = LockFile(PATH_projects_success)
     with lock:
         with open(PATH_projects_success,'a+') as project_success:
             project_success.write(proj_path+'\n')
 
 def tokenize(list_projects, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name, file_starting_id):
     # Each tokenize call represents a new process
     with open(FILE_tokens_name, 'w') as FILE_tokens, \
          open(FILE_bookkeeping_file_name, 'w') as FILE_bookkeeping_file, \
          open(FILE_bookkeeping_proj_name, 'w') as FILE_bookkeeping_proj:
         for proj_id, proj_path in list_projects:
             tokenizer(str(proj_id), proj_path, file_starting_id, FILE_tokens, FILE_bookkeeping_file, FILE_bookkeeping_proj)
 
 if __name__ == '__main__':
     # In the main block we:
     #   create directories if they do not exist
     #   read the list in PATH_projects_success, if it exists, and do not process those projects again
     #   give each process a unique file for tokens and for file and project
     #     bookkeeping in the proper folders
     #   start N_PROCESSES, and give them [(unique_id, proj_path)]
     if not os.path.exists(PATH_tokens_folder):
         os.makedirs(PATH_tokens_folder)
     if not os.path.exists(PATH_bookkeeping_file_folder):
         os.makedirs(PATH_bookkeeping_file_folder)
     if not os.path.exists(PATH_bookkeeping_proj_folder):
         os.makedirs(PATH_bookkeeping_proj_folder)
 
     proj_paths = []
     with open(PATH_proj_paths) as f:
         for line in f:
             proj_paths.append(line.strip('\n'))
 
     projects_success = []
     try:
         with open(PATH_projects_success,'r') as f:
             for line in f:
                 projects_success.append(line.strip().strip('\n'))
     except IOError as e:
         logging.info('File '+PATH_projects_success+' not found')
 
     projects_starting_index = 0
     proj_paths = list(set(proj_paths) - set(projects_success))
 
     # Initialize projects_starting_index with the previously logged number
     if not os.path.exists(PATH_project_starting_index):
         with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
             FILE_project_starting_index.write(str(len(proj_paths))+'\n')
     else:
         try:
             with open(PATH_project_starting_index, 'r') as FILE_project_starting_index:
                 projects_starting_index = int(FILE_project_starting_index.readline().strip('\n'))
         except ValueError:
             projects_starting_index = 0
         with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
             FILE_project_starting_index.write(str(projects_starting_index+len(proj_paths))+'\n')
 
     proj_paths = zip(range(projects_starting_index, len(proj_paths)+projects_starting_index),proj_paths)
 
     # Split the list of projects into N_PROCESSES lists
     proj_paths_list = [ proj_paths[i::N_PROCESSES] for i in xrange(N_PROCESSES) ]
 
     # Multiprocessing with N_PROCESSES
     processes = []
     # Multiprocessing shared variable instance for recording file_id
     file_starting_id = Value('i', 0)
     process_num = 0
     n = 0
     for input_process in proj_paths_list:
         # Skip empty sublists
         if len(input_process) == 0:
             continue
         process_num += 1
         FILE_tokens_name = PATH_tokens_folder+'/'+'tokens_'+str(n)+'.txt'
         FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/'+'bookkeeping_file_'+str(n)+'.txt'
         FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/'+'bookkeeping_proj_'+str(n)+'.txt'
         while (os.path.isfile(FILE_tokens_name) and os.path.isfile(FILE_bookkeeping_file_name) and os.path.isfile(FILE_bookkeeping_proj_name)):
             n += 1
             FILE_tokens_name = PATH_tokens_folder+'/'+'tokens_'+str(n)+'.txt'
             FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/'+'bookkeeping_file_'+str(n)+'.txt'
             FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/'+'bookkeeping_proj_'+str(n)+'.txt'
         n += 1
         processes.append(Process(name='Process '+str(process_num), target=tokenize,
                                  args=(input_process, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name, file_starting_id, )))
 
     for proc in processes:
         proc.start()
         logging.info(proc.name)
 
     for proc in processes:
         proc.join()
\ No newline at end of file
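--
Notes (commentary, not part of the patch):

The bug: os.walk() yields each filename relative to the dirpath it was found
in, but the old code kept only the bare filename and later rebuilt the path
as proj_path+'/'+file_path. That works for files sitting directly in the
project root and silently breaks for anything nested: the rebuilt path does
not exist, the open() raises IOError, and the file is logged as 'File not
found' and skipped. Joining with dirpath inside the walk, and dropping the
later concatenation, yields a correct path at every depth. A minimal sketch
of the difference, with a made-up project layout:

    import os

    proj_path = '/tmp/demo-proj'
    # hypothetical layout:
    #   /tmp/demo-proj/Top.java
    #   /tmp/demo-proj/src/Nested.java
    for dirpath, dirnames, filenames in os.walk(proj_path):
        for name in filenames:
            old_path = proj_path + '/' + name       # for Nested.java: '/tmp/demo-proj/Nested.java' (wrong)
            new_path = os.path.join(dirpath, name)  # for Nested.java: '/tmp/demo-proj/src/Nested.java'
            print(new_path + ' exists: ' + str(os.path.isfile(new_path)))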
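One caveat the patch does not touch: the block that advances file_starting_id
wraps the update in a freshly constructed Lock(), and a lock created inside
one worker is invisible to its siblings, so it provides no cross-process
exclusion. The synchronized Value already carries a shared lock of its own.
A sketch of the usual reservation pattern (an assumption about the intent,
not what the file does today):

    from multiprocessing import Value

    file_starting_id = Value('i', 0)

    def reserve_ids(counter, count):
        # get_lock() returns the lock shared by every process holding the
        # Value, so concurrent reservations cannot hand out overlapping ranges.
        with counter.get_lock():
            start = counter.value
            counter.value += count
        return range(start, start + count)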
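For readers of the output, each record written to the tokens file has the
shape proj_id,file_id,total_tokens,unique_tokens,md5@#@token@@::@@count,...
A sketch that rebuilds one such line from the write calls above (the ids and
source text are made up, and the order of the token pairs is arbitrary
because it comes from a dict):

    import collections
    import hashlib

    words = 'static int x = x + 1'.split()
    counts = dict(collections.Counter(words))
    tokens = ','.join(k + '@@::@@' + str(v) for k, v in counts.items())
    m = hashlib.md5()
    m.update(tokens.encode())  # the Python 2 original hashes the str directly
    print('1,42,%d,%d,%s@#@%s' % (len(words), len(counts), m.hexdigest(), tokens))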
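Finally, the main block distributes work by striding rather than chunking:
proj_paths[i::N_PROCESSES] takes every N-th (id, path) pair starting at
offset i, so the sublists stay within one element of each other in length.
For example:

    proj_paths = [(0, 'p0'), (1, 'p1'), (2, 'p2'), (3, 'p3'), (4, 'p4')]
    N_PROCESSES = 2
    sublists = [proj_paths[i::N_PROCESSES] for i in range(N_PROCESSES)]
    # sublists[0] == [(0, 'p0'), (2, 'p2'), (4, 'p4')]
    # sublists[1] == [(1, 'p1'), (3, 'p3')]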