fix traversing in file-level tokenizer input #9

Open · wants to merge 1 commit into base: master
2 changes: 1 addition & 1 deletion tokenizers/file-level/src/tokenizer-directory.py
@@ -1 +1 @@
 import logging
 import sys
 from multiprocessing import Process, Value, Lock
 import re
 import os
 import collections
 from lockfile import LockFile
 import hashlib
 try:
     from configparser import ConfigParser
 except ImportError:
     from ConfigParser import ConfigParser  # ver. < 3.0
 
 # Logging code
 FORMAT = '[%(levelname)s] (%(threadName)s) %(message)s'
 logging.basicConfig(level=logging.DEBUG, format=FORMAT)
 file_handler = logging.FileHandler('results.log')
 file_handler.setFormatter(logging.Formatter(FORMAT))
 logging.getLogger().addHandler(file_handler)
 
 config_file = 'config.ini'
 # instantiate
 config = ConfigParser()
 # parse existing file
 try:
     config.read(config_file)
 except IOError:
     print 'config settings not found'
     logging.error('Config file ['+config_file+'] not found')
     sys.exit()
 
 # Provided by the user
 N_PROCESSES = config.getint('Main', 'N_PROCESSES')
 language_file = config.get('Main', 'language_file')
 # Folders
 PATH_proj_paths = config.get('Folders/Files', 'PATH_proj_paths')
 PATH_tokens_folder = config.get('Folders/Files', 'PATH_tokens_folder')
 PATH_bookkeeping_file_folder = config.get('Folders/Files', 'PATH_bookkeeping_file_folder')
 PATH_bookkeeping_proj_folder = config.get('Folders/Files', 'PATH_bookkeeping_proj_folder')
 PATH_projects_success = config.get('Folders/Files', 'PATH_projects_success')
 PATH_project_starting_index = config.get('Folders/Files', 'PATH_project_starting_index')
 PATH_projects_fail = config.get('Folders/Files', 'PATH_projects_fail')
 
 try:
     config.read(language_file)
 except IOError:
     print 'Language settings not found'
     logging.error('Language settings ['+language_file+'] not found')
     sys.exit()
 
 # Read language settings
 separators = config.get('Language', 'separators').split(' ')
 file_extensions = config.get('Language', 'file_extensions').split(' ')
 comment_end_of_line = config.get('Language', 'comment_inline')
 comment_open_tag = re.escape(config.get('Language', 'comment_open_tag'))
 comment_close_tag = re.escape(config.get('Language', 'comment_close_tag'))
 
 def tokenizer(proj_id, proj_path, file_starting_id, FILE_tokens, FILE_bookkeeping_file, FILE_bookkeeping_proj):
     if not os.path.exists(proj_path):
         logging.error('Project not found <'+proj_id+','+proj_path+'>')
         # Important to have a global lock on this file because it is shared
         lock = LockFile(PATH_projects_fail)
         with lock:
             with open(PATH_projects_fail, 'a+') as project_fail:
                 project_fail.write(proj_path+'\n')
         return
 
     # In case process names need to be logged
     # process_name = '['+mp.current_process().name+'] '
     # logging.info(process_name+'Starting proj <'+proj_id+','+proj_path+'>')
 
     all_files = []
     for (dirpath, dirnames, filenames) in os.walk(proj_path):
         aux_list = []
         for extension in file_extensions:
             aux = [x for x in filenames if x.endswith(extension)]
-            #aux = [os.path.join(dirpath, x) for x in aux]
+            aux = [os.path.join(dirpath, x) for x in aux]
             aux_list.extend(aux)
         all_files.extend(aux_list)
 
     # Increment the shared file_starting_id by the amount of files in the current project
     lock = Lock()
     with lock:
         all_files = zip(range(file_starting_id.value, file_starting_id.value+len(all_files)), all_files)
         file_starting_id.value += len(all_files)
 
     for file_id, file_path in all_files:
-        file_path = proj_path+'/'+file_path
         logging.info('Starting file <'+proj_id+','+str(file_id)+','+file_path+'>')
         try:
             with open(file_path, 'r') as myfile:
                 file_string = myfile.read()
         except IOError:
             logging.error('File not found <'+proj_id+','+str(file_id)+','+file_path+'>')
             continue
 
         # Remove end-of-line comments
         file_string = re.sub(comment_end_of_line+'.*?\n', '', file_string, flags=re.DOTALL)
         # Remove tagged comments
         file_string = re.sub(comment_open_tag+'.*?'+comment_close_tag, '', file_string, flags=re.DOTALL)
         # Transform separators into spaces (remove them)
         for x in separators:
             file_string = file_string.replace(x, ' ')
         # Create a list of tokens
         file_string = file_string.split()
         # Total number of tokens
         tokens_count_total = len(file_string)
         # Count occurrences
         file_string = collections.Counter(file_string)
         # Convert the Counter to a plain dict
         file_string = dict(file_string)
         # Unique number of tokens
         tokens_count_unique = len(file_string)
 
         tokens = []
         # SourcererCC formatting
         for k, v in file_string.items():
             tokens.append(k+'@@::@@'+str(v))
         tokens = ','.join(tokens)
 
         # MD5
         m = hashlib.md5()
         m.update(tokens)
 
         FILE_bookkeeping_file.write(proj_id+','+str(file_id)+','+file_path+'\n')
         FILE_tokens.write(proj_id+','+str(file_id)+','+str(tokens_count_total)+','+str(tokens_count_unique)
                           + ','+m.hexdigest()
                           + '@#@'+tokens+'\n')
 
     FILE_bookkeeping_proj.write(proj_id+','+proj_path+'\n')
 
     # Important to have a global lock on this file because it is shared
     lock = LockFile(PATH_projects_success)
     with lock:
         with open(PATH_projects_success, 'a+') as project_success:
             project_success.write(proj_path+'\n')
 
 def tokenize(list_projects, FILE_tokens_name, FILE_bookkeeping_file_name, FILE_bookkeeping_proj_name, file_starting_id):
     # Each tokenize call represents a new process
     with open(FILE_tokens_name, 'w') as FILE_tokens, \
          open(FILE_bookkeeping_file_name, 'w') as FILE_bookkeeping_file, \
          open(FILE_bookkeeping_proj_name, 'w') as FILE_bookkeeping_proj:
         for proj_id, proj_path in list_projects:
             tokenizer(str(proj_id), proj_path, file_starting_id, FILE_tokens, FILE_bookkeeping_file, FILE_bookkeeping_proj)
 
 if __name__ == '__main__':
     # In the main block we:
     #   create directories if they do not exist
     #   read the list in PATH_projects_success, if it exists, and do not process these again
     #   give each process a unique file for tokens plus file and project
     #     bookkeeping in the proper folders
     #   start N_PROCESSES, and give them [(unique_id, proj_path)]
     if not os.path.exists(PATH_tokens_folder):
         os.makedirs(PATH_tokens_folder)
     if not os.path.exists(PATH_bookkeeping_file_folder):
         os.makedirs(PATH_bookkeeping_file_folder)
     if not os.path.exists(PATH_bookkeeping_proj_folder):
         os.makedirs(PATH_bookkeeping_proj_folder)
 
     proj_paths = []
     with open(PATH_proj_paths) as f:
         for line in f:
             proj_paths.append(line.strip('\n'))
 
     projects_success = []
     try:
         with open(PATH_projects_success, 'r') as f:
             for line in f:
                 projects_success.append(line.strip().strip('\n'))
     except IOError:
         logging.info('File '+PATH_projects_success+' not found')
 
     projects_starting_index = 0
     proj_paths = list(set(proj_paths) - set(projects_success))
 
     # Initialize projects_starting_index with the previously logged number
     if not os.path.exists(PATH_project_starting_index):
         with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
             FILE_project_starting_index.write(str(len(proj_paths))+'\n')
     else:
         try:
             with open(PATH_project_starting_index, 'r') as FILE_project_starting_index:
                 projects_starting_index = int(FILE_project_starting_index.readline().strip('\n'))
         except ValueError:
             projects_starting_index = 0
         with open(PATH_project_starting_index, 'w') as FILE_project_starting_index:
             FILE_project_starting_index.write(str(projects_starting_index+len(proj_paths))+'\n')
 
     proj_paths = zip(range(projects_starting_index, len(proj_paths)+projects_starting_index), proj_paths)
 
     # Split the list of projects into N_PROCESSES lists
     proj_paths_list = [proj_paths[i::N_PROCESSES] for i in xrange(N_PROCESSES)]
 
     # Multiprocessing with N_PROCESSES
     processes = []
     # Multiprocessing shared variable instance for recording file_id
     file_starting_id = Value('i', 0)
     process_num = 0
     n = 0
     for input_process in proj_paths_list:
         # Skip empty sublists
         if len(input_process) == 0:
             continue
         process_num += 1
         FILE_tokens_name = PATH_tokens_folder+'/tokens_'+str(n)+'.txt'
         FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/bookkeeping_file_'+str(n)+'.txt'
         FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/bookkeeping_proj_'+str(n)+'.txt'
         # Advance n until an unused set of output file names is found
         while (os.path.isfile(FILE_tokens_name) and os.path.isfile(FILE_bookkeeping_file_name)
                and os.path.isfile(FILE_bookkeeping_proj_name)):
             n += 1
             FILE_tokens_name = PATH_tokens_folder+'/tokens_'+str(n)+'.txt'
             FILE_bookkeeping_file_name = PATH_bookkeeping_file_folder+'/bookkeeping_file_'+str(n)+'.txt'
             FILE_bookkeeping_proj_name = PATH_bookkeeping_proj_folder+'/bookkeeping_proj_'+str(n)+'.txt'
         n += 1
         processes.append(Process(name='Process '+str(process_num), target=tokenize,
                                  args=(input_process, FILE_tokens_name, FILE_bookkeeping_file_name,
                                        FILE_bookkeeping_proj_name, file_starting_id, )))
 
     for proc in processes:
         proc.start()
         logging.info(proc.name)
 
     for proc in processes:
         proc.join()
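Why the one-line change matters: os.walk() yields each filenames list relative to the directory it is currently visiting (dirpath), not relative to the root of the walk. The old code collected bare filenames and only later prefixed them with proj_path, so any file in a subdirectory resolved to a nonexistent path and was skipped with a 'File not found' error. A minimal sketch of the difference, assuming a hypothetical project layout (the paths below are illustrative, not from this repo):

import os

# Hypothetical layout:
#   myproj/Main.java
#   myproj/src/util/Helper.java
proj_path = 'myproj'

for dirpath, dirnames, filenames in os.walk(proj_path):
    for name in filenames:
        old_path = proj_path + '/' + name       # Helper.java -> 'myproj/Helper.java' (no such file)
        new_path = os.path.join(dirpath, name)  # Helper.java -> 'myproj/src/util/Helper.java'

Since dirpath already starts with proj_path, the join also makes the old file_path = proj_path+'/'+file_path prefix harmful (it would yield paths like 'myproj/myproj/Main.java'), which is why the second hunk deletes that line.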
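A separate observation about the unchanged code around the fix, not something this PR addresses: lock = Lock() constructs a brand-new multiprocessing.Lock inside each tokenizer() call, so the block that increments the shared file_starting_id is not actually serialized across worker processes. A Value created as Value('i', 0) already carries a process-shared lock, reachable via get_lock(). A hedged sketch of that pattern (claim_file_ids is an illustrative helper name, not from the repo):

from multiprocessing import Value

file_starting_id = Value('i', 0)

def claim_file_ids(counter, count):
    # Illustrative helper, not part of tokenizer-directory.py.
    # counter.get_lock() is the lock that Value shares across processes; a
    # freshly constructed Lock(), as in the file above, is private to its
    # creator and synchronizes nothing.
    with counter.get_lock():
        start = counter.value
        counter.value += count
    return range(start, start + count)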