You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
import os
import shutil
# Only files whose names carry this prefix are processed (see get_all_files)
FILE_NAME_PREFIX = "wiki_"
# Root directory of the extracted results; its sub-directories are scanned
# for input files
START_DIR = "/home/ubuntu/datasets/text"
# Directory that receives one "<doc_id>.txt" file per document.
# NOTE: main() deletes and recreates this directory on every run.
DOCS_SAVE_PATH = "/home/ubuntu/datasets/utf8_wikipedia_data"
def get_all_files(start_dir=None, file_name_prefix=None):
    """Collect the paths of all input files one level below ``start_dir``.

    Every sub-directory of ``start_dir`` is scanned (non-recursively) and each
    regular, non-hidden file whose name starts with ``file_name_prefix`` is
    recorded.

    Args:
        start_dir: Root directory to scan. Defaults to ``START_DIR``.
        file_name_prefix: Required file-name prefix. Defaults to
            ``FILE_NAME_PREFIX``.

    Returns:
        A list of file paths, sorted by directory name and then file name so
        that the processing order is reproducible between runs.
    """
    if start_dir is None:
        start_dir = START_DIR
    if file_name_prefix is None:
        file_name_prefix = FILE_NAME_PREFIX

    all_files = []
    # os.listdir returns entries in arbitrary order, hence the explicit sort
    for dir_name in sorted(os.listdir(start_dir)):
        dir_path = os.path.join(start_dir, dir_name)
        # Stray regular files next to the sub-directories cannot be listed
        if not os.path.isdir(dir_path):
            continue
        for file_name in sorted(os.listdir(dir_path)):
            # Hidden files are skipped even if the prefix would admit them
            if file_name.startswith("."):
                continue
            # A true prefix match: the marker appearing mid-name is not enough
            if not file_name.startswith(file_name_prefix):
                continue
            file_path = os.path.join(dir_path, file_name)
            if os.path.isfile(file_path):
                all_files.append(file_path)
    return all_files
RECORD_END_MARKER = "</doc>"


def _parse_doc_id(header_line):
    """Return the ``id`` attribute value from a ``<doc ...>`` record header.

    Raises:
        ValueError: If the line is not a ``<doc`` header carrying a non-empty
            ``id="..."`` attribute.
    """
    _, id_found, remainder = header_line.partition(' id="')
    doc_id, quote_found, _ = remainder.partition('"')
    if not (header_line.startswith("<doc") and id_found and quote_found and doc_id):
        raise ValueError("Malformed record header: {!r}".format(header_line))
    return doc_id


def extract_records_from_docs(doc_path, save_dir=None):
    """Split one extracted file into a separate text file per record.

    The input is read as UTF-8. Each record starts with a ``<doc ... id="N"
    ...>`` header line and ends at the first line containing
    ``RECORD_END_MARKER``. The non-empty, stripped lines in between are joined
    with newlines and written as UTF-8 to ``<save_dir>/<N>.txt``. When a
    record has more than one non-empty line, the first one is dropped
    (presumably the title line that repeats the header's title -- confirm
    against the extractor's output format).

    Blank lines between records are ignored, and a final record that is
    missing its end marker is terminated by end of file instead of crashing.

    Args:
        doc_path: Path of the extracted file to split.
        save_dir: Directory receiving the per-record files; it must already
            exist. Defaults to ``DOCS_SAVE_PATH``.

    Returns:
        The length, in characters, of the longest text written for this file
        (0 if the file holds no records).

    Raises:
        ValueError: If a non-blank line found where a record should start is
            not a valid ``<doc ... id="...">`` header.
    """
    if save_dir is None:
        save_dir = DOCS_SAVE_PATH

    max_txt_len = 0
    with open(doc_path, "r", encoding="utf-8") as reader:
        # Both loops below consume the same iterator, so the outer loop
        # resumes right after the line that closed the previous record.
        lines = iter(reader)
        for raw_line in lines:
            header_line = raw_line.strip()
            if not header_line:
                # Tolerate blank separators and trailing newlines
                continue
            doc_id = _parse_doc_id(header_line)

            # Gather the record body up to (not including) the end marker
            doc_lines = []
            for body_line in lines:
                if RECORD_END_MARKER in body_line:
                    break
                stripped = body_line.strip()
                if stripped:
                    doc_lines.append(stripped)

            # Drop the leading line unless it is the only content available
            if len(doc_lines) > 1:
                del doc_lines[0]
            txt_to_write = "\n".join(doc_lines)
            max_txt_len = max(max_txt_len, len(txt_to_write))

            save_path = os.path.join(save_dir, doc_id + ".txt")
            with open(save_path, "w", encoding="utf-8") as writer:
                writer.write(txt_to_write)
    return max_txt_len
def main():
    """Rebuild the output directory and split every input file into documents.

    Any existing ``DOCS_SAVE_PATH`` directory is removed and recreated empty,
    then each file returned by ``get_all_files`` is processed. The longest
    text length is printed per file and, at the end, across all files.
    """
    # Start from a clean, empty output directory
    if os.path.exists(DOCS_SAVE_PATH):
        shutil.rmtree(DOCS_SAVE_PATH)
    os.makedirs(DOCS_SAVE_PATH, exist_ok=True)

    # Process each input file, tracking the longest text seen so far
    longest_overall = 0
    for source_path in get_all_files():
        longest_in_file = extract_records_from_docs(source_path)
        print("Got max len of ", longest_in_file, "for file", source_path)
        if longest_in_file > longest_overall:
            longest_overall = longest_in_file
    print("Got maximum txt length of", longest_overall)


if __name__ == "__main__":
    main()
By default, running this script produces .txt files that each contain multiple documents. I want each .txt file to contain exactly one document. What should I change?
The text was updated successfully, but these errors were encountered: