' in line) ^ ('
' in line)):
+ in_code_block = not in_code_block
+ if options[VERBOSE]:
+ if in_code_block:
+ print("Detected start of a codeblock, not registering titles")
+ else:
+ print("Detected end of codeblock, registering titles again")
+
+ # only split up if current line is in a fully non-os-specific section
+ if in_if_statement == 0:
+
+ title_level = check_for_title(line, in_code_block, curr_dirs, options)
+
+ # line is a title with a maximum depth of 4
+ if title_level > 0:
+ if after_first_title:
+
+ # write text of previous file
+ if previous_contained_if:
+ paragraphs_os_text[title] = current_paragraph
+ if options[VERBOSE]:
+ print("Saved os-specific chunk with temporary title: " + title + "\n")
+ else:
+ paragraphs_os_free_text[title] = current_paragraph
+ if options[VERBOSE]:
+ print("Saved generic chunk with title: " + title + "\n")
+
+ # write metadata of previous file
+ paragraphs_metadata[title] = write_metadata(main_title, title, link_list, last_title_level, last_dir, options[SOURCE_DIRECTORY] + '/' + main_title + '.md')
+
+ # make a new title
+ title = make_valid_title(line[title_level + 1:-1])
+
+ # create an entry for the file in the paragraphs text dictionary
+ current_paragraph = ""
+
+ after_first_title = True
+ subtitle_order.append(title)
+
+ # reset link_list
+ link_list = []
+
+ previous_contained_if = False
+
+ # line is not a title
+ elif after_first_title:
+ line, link_list = replace_markdown_markers(line, link_list, in_code_block, main_title, is_linux_tutorial)
+ if line != "\n":
+ current_paragraph += line
+
+ # keep track of title level and directory to write to metadata upon discovering a new subtitle
+ if title_level > 0:
+ last_title_level = title_level
+ last_dir = curr_dirs[last_title_level]
+ else:
+ previous_contained_if = True
+ line, link_list = replace_markdown_markers(line, link_list, in_code_block, main_title, is_linux_tutorial)
+ if line != "\n":
+ current_paragraph += line
+
+ # write dictionaries for the last file
+ if previous_contained_if:
+ paragraphs_os_text[title] = current_paragraph
+ if options[VERBOSE]:
+ print("Saved os-specific chunk with temporary title: " + title + "\n")
+ else:
+ paragraphs_os_free_text[title] = current_paragraph
+ if options[VERBOSE]:
+ print("Saved generic chunk with title: " + title + "\n")
+ paragraphs_metadata[title] = write_metadata(main_title, title, link_list, last_title_level, curr_dirs[last_title_level], options[SOURCE_DIRECTORY] + '/' + main_title + '.md')
+
+ return paragraphs_os_text, paragraphs_os_free_text, paragraphs_metadata, subtitle_order
+
+
def _paragraph_title(main_title, OS, current_paragraph_number, paragraph_number):
    """
    Helper that builds the title under which a paragraph chunk is stored

    :param main_title: the main title of the file
    :param OS: the OS of the chunk, only used when splitting an os-specific paragraph
    :param current_paragraph_number: number of the os-specific paragraph that is being split, -1 when splitting a whole file
    :param paragraph_number: sequence number of the paragraph within the current split
    :return: the generated paragraph title
    """
    if current_paragraph_number == -1:
        return main_title + _PARAGRAPH_ + f"{paragraph_number:03}"
    return main_title + "_" + OS + _PARAGRAPH_ + f"{current_paragraph_number:03}.{paragraph_number:03}"


def split_on_paragraphs(file, main_title, options, is_linux_tutorial, current_paragraph_number=-1, OS=GENERIC):
    """
    Function that splits the text into smaller sections based on the paragraph structure and stores their text and metadata in dictionaries

    A new paragraph is started on an empty line, provided the text collected so far is long enough and the line is not
    inside a codeblock, a list or an os-specific if-statement. Text that contains os-specific if-statements is kept
    together in one chunk and stored separately.

    :param file: the filepath of the file to be split
    :param main_title: the main title of the file
    :param options: dictionary containing the options given by the user
    :param is_linux_tutorial: boolean indicating whether the current file is part of the linux tutorial
    :param current_paragraph_number: number of the paragraph that is being split, only applicable when splitting an os-specific paragraph
    :param OS: the OS of the file to be split, only applicable when splitting an os-specific paragraph
    :return paragraphs_os_text: dictionary containing the chunks of text that contain os-specific if-statements
    :return paragraphs_os_free_text: dictionary containing the generic chunks of text
    :return paragraphs_metadata: dictionary containing the metadata of each split section of text
    :return subtitle_order: list containing all generated paragraph titles in order of appearance
    """

    if options[VERBOSE]:
        print("Splitting on paragraphs\n")

    # source file recorded in the metadata of every chunk
    source_file = options[SOURCE_DIRECTORY] + '/' + main_title + '.md'

    # loop-invariant patterns, compiled once
    list_entry = re.compile(r'^(\s*)([*+-]|\d+\.|[a-zA-Z]\.)\s+.*$')
    list_continuation = re.compile(r'^\s{2,}.+$')
    list_or_blank = re.compile(r'^(\s*)([*+-]|\d+\.|[a-zA-Z]\.)\s+.*$|^\s{2,}.+$|^\n')
    mangled_if = re.compile(r'\{' + IF_MANGLED_PART + '%.*?%' + IF_MANGLED_PART + '}')

    # start of assuming we are not in a code_block
    in_code_block = False

    # define initial dictionaries
    paragraphs_os_free_text = {}
    paragraphs_os_text = {}
    paragraphs_metadata = {}

    # variable to keep track of the current paragraph
    current_paragraph = ""

    # list to keep track of links in the text
    link_list = []

    # list to keep track of the order of the subtitles
    subtitle_order = []

    # variable to keep track of how many if-statements deep the current line is
    in_if_statement = 0

    # variable to indicate that previous section was one with if-statements
    previous_contained_if = False

    # variable to indicate that the previous line was part of a list
    in_list = False

    # paragraph number to add to title
    paragraph_number = 1

    # metadata title
    metadata_title = main_title

    if current_paragraph_number != -1:
        # splitting an os-specific paragraph: placeholders that are replaced later on in the process
        last_title_level = 4
        last_dir = "PLACEHOLDER"
    else:
        # splitting a whole file: text in front of the first title (or a file without any title) belongs to the
        # main title; without these defaults such a file raised an UnboundLocalError
        last_title_level = 1
        last_dir = main_title

    # list to keep track of most recent directories on each title level
    curr_dirs = [main_title for _ in range(options[MAX_TITLE_DEPTH] + 1)]

    with open(file, 'r') as readfile:

        # Create two independent iterators from the original file iterator (needed to check for lists)
        current_line, next_line = tee(readfile)

        # Advance the next_line iterator by one step, so it is always one step ahead
        next(next_line, None)

        # Process the lines
        for line, nxt in zip_longest(current_line, next_line, fillvalue=""):

            # detect if-statements starting or ending on the current line
            in_if_statement += len(re.findall(IF_MANGLED_PATTERNS[IF], line)) - len(
                re.findall(IF_MANGLED_PATTERNS[ENDIF], line))

            # detect whether the current line is in a list
            if list_entry.search(line):  # beginning of a list entry
                in_list = True
                if options[VERBOSE]:
                    print("First line of new list entry found, not starting new paragraphs: " + line[:-1])
            elif list_continuation.search(line) and in_list:  # middle of a list entry
                pass
            elif list_or_blank.search(nxt) and in_list:  # line(s) between list entries
                pass
            elif list_entry.search(nxt):
                in_list = True
            elif in_list:
                if options[VERBOSE]:
                    print("List ended, starting new paragraphs again")
                in_list = False
            else:
                in_list = False

            # detect codeblocks to make sure titles aren't detected in them
            # (a line holding both the opening and the closing html tag does not change the state)
            if '```' in line or (('<pre><code>' in line) ^ ('</code></pre>' in line)):
                in_code_block = not in_code_block
                if options[VERBOSE]:
                    if in_code_block:
                        print("Detected start of a codeblock, not starting new paragraphs")
                    else:
                        print("Detected end of codeblock, starting new paragraphs again")

            # only split up if current line is in a fully non-os-specific section
            if in_if_statement == 0:

                title_level = check_for_title(line, in_code_block, curr_dirs, options)

                # check whether a new paragraph should be started (mangled if-statements don't count towards the length)
                if line == "\n" and paragraph_long_enough(mangled_if.sub("", current_paragraph), options) and not in_code_block and not in_list:

                    # create a title for the previous paragraph
                    paragraph_title = _paragraph_title(main_title, OS, current_paragraph_number, paragraph_number)
                    paragraph_number += 1

                    # write text of previous file
                    if previous_contained_if:
                        paragraphs_os_text[paragraph_title] = current_paragraph
                        if options[VERBOSE]:
                            print("Saved os-specific chunk with temporary title: " + paragraph_title + "\n")
                    else:
                        paragraphs_os_free_text[paragraph_title] = current_paragraph
                        if options[VERBOSE]:
                            print("Saved generic chunk with title: " + paragraph_title + "\n")

                    # write metadata of previous file
                    paragraphs_metadata[paragraph_title] = write_metadata(main_title, metadata_title, link_list, last_title_level, last_dir, source_file=source_file)
                    subtitle_order.append(paragraph_title)

                    # reset the current paragraph
                    current_paragraph = ""

                    # reset link_list
                    link_list = []

                    previous_contained_if = False

                # line is a title with a maximum depth of 4
                elif title_level > 0:

                    # make a new title
                    metadata_title = make_valid_title(line[title_level + 1:-1])

                    # the title text itself (without the leading #'s) stays part of the paragraph
                    line, link_list = replace_markdown_markers(line[title_level + 1:], link_list, in_code_block, main_title, is_linux_tutorial)
                    current_paragraph += line

                # line is not a title or the beginning of a new paragraph
                elif line != "\n" or previous_contained_if:
                    line, link_list = replace_markdown_markers(line, link_list, in_code_block, main_title, is_linux_tutorial)
                    current_paragraph += line

                # keep track of title level and directory to write to metadata upon discovering a new subtitle
                if title_level > 0:
                    last_title_level = title_level
                    last_dir = curr_dirs[last_title_level]
            else:
                # inside an os-specific if-statement: never split, just collect the text
                previous_contained_if = True
                line, link_list = replace_markdown_markers(line, link_list, in_code_block, main_title, is_linux_tutorial)
                current_paragraph += line

    # create a title for the last paragraph
    paragraph_title = _paragraph_title(main_title, OS, current_paragraph_number, paragraph_number)

    # write dictionaries for the last file
    if previous_contained_if:
        paragraphs_os_text[paragraph_title] = current_paragraph
        if options[VERBOSE]:
            print("Saved os-specific chunk with temporary title: " + paragraph_title + "\n")
    else:
        paragraphs_os_free_text[paragraph_title] = current_paragraph
        if options[VERBOSE]:
            print("Saved generic chunk with title: " + paragraph_title + "\n")
    paragraphs_metadata[paragraph_title] = write_metadata(main_title, metadata_title, link_list, last_title_level, curr_dirs[last_title_level], source_file=source_file)
    subtitle_order.append(paragraph_title)

    return paragraphs_os_text, paragraphs_os_free_text, paragraphs_metadata, subtitle_order
+
+
def paragraph_long_enough(paragraph, options):
    """
    Function that checks if the paragraph is long enough to be split off

    The length is measured in tokens of the "cl100k_base" tiktoken encoding.

    :param paragraph: current paragraph
    :param options: dictionary containing the options given by the user
    :return: True when the paragraph contains at least MIN_PARAGRAPH_LENGTH tokens, False otherwise
    """
    tokens = tiktoken.get_encoding("cl100k_base").encode(paragraph)

    return len(tokens) >= options[MIN_PARAGRAPH_LENGTH]
+
+
def write_metadata(main_title, subtitle, links, title_level, directory, source_file):
    """
    Function that collects the metadata about a text section in a dictionary

    :param main_title: The main title of the file containing the section
    :param subtitle: the title of the section
    :param links: a list of links contained within the section
    :param title_level: the depth of the title of the section
    :param directory: the directory where the section will eventually be written (can either be generic or os-specific)
    :param source_file: the source file that the section originates from
    :return paragraph_metadata: dictionary containing the metadata about the section
    """
    section_dir = Path(directory)

    paragraph_metadata = {
        MAIN_TITLE: main_title,
        SUBTITLE: subtitle,
        SOURCE_FILE: source_file,
        TITLE_DEPTH: title_level,
        DIRECTORY: section_dir.as_posix(),
    }

    # links are only included when there are any, keyed by their position as a string
    if len(links) > 0:
        paragraph_metadata[LINKS] = {str(position): link for position, link in enumerate(links)}

    # the parent title is the name of the directory one level above the section directory
    paragraph_metadata[PARENT_TITLE] = section_dir.parent.name

    return paragraph_metadata
+
+
def jinja_parser(filename, copy_location, options):
    """
    function that lets jinja format a file (macros and non-os if-statements) while leaving the os-related if-statements intact

    The file is first written in if-mangled form, its remaining if-statements are altered, and the result is rendered
    with the words of the YAML file as context. The rendered text overwrites the file at copy_location.

    :param filename: the name of the file that needs to be formatted using jinja
    :param copy_location: the location of the file that needs to be formatted using jinja
    :param options: dictionary containing the options given by the user
    :return:
    """
    # YAML file location
    yml_file_path = os.path.join(RETURN_DIR, RETURN_DIR, MKDOCS_DIR, EXTRA_DIR, 'gent.yml')

    if options[VERBOSE]:
        print("Reading YAML file from location: " + yml_file_path)

    # Read the YAML file
    with open(yml_file_path, 'r') as yml_file:
        words_dict = yaml.safe_load(yml_file)

    # ugly fix for index.md error that occurs because of the macro "config.repo_url" in mkdocs/docs/HPC/index.md
    combined_context = {**words_dict, 'config': {'repo_url': REPO_URL}}

    # Mangle the OS-related if-statements
    if options[VERBOSE]:
        print("Mangling OS-specific if-statements")
    mangle_ifs(copy_location, filename, options)

    # Alter the other if-statements
    if options[VERBOSE]:
        print("Altering other if-statements to parse properly")
    alter_ifs(filename, options)

    # Use Jinja2 to replace the macros; templates are looked up in the if-mangled files first
    search_paths = [
        os.path.join(options[DESTINATION_DIRECTORY], IF_MANGLED_FILES),
        options[SOURCE_DIRECTORY],
        os.path.join(options[SOURCE_DIRECTORY], RETURN_DIR),
    ]
    template_loader = ChoiceLoader([FileSystemLoader(searchpath=search_paths), FunctionLoader(load_macros)])
    rendered_content = Environment(loader=template_loader).get_template(filename).render(combined_context)

    if options[VERBOSE]:
        print("jinja parsing finished\nWriting jinja-parsed file to location: " + copy_location)

    # Save the rendered content to a new file
    with open(copy_location, 'w', encoding='utf-8', errors='ignore') as output_file:
        output_file.write(rendered_content)
+
+
def load_macros(name):
    """
    function used by the jinja FunctionLoader to retrieve templates from the macros folder since the normal FileSystemLoader can't locate them properly

    :param name: name of the requested template
    :return: the content of the macro file, or None when the name does not refer to the macros folder
    """
    macros_marker = "../" + MACROS + "/"

    # only names that point into the macros folder are handled here
    if macros_marker not in name:
        return None

    macros_location = os.path.join(RETURN_DIR, RETURN_DIR, MKDOCS_DIR, DOCS_DIR, MACROS)
    file_location = os.path.join(macros_location, name.split(macros_marker)[1])

    with open(file_location, 'r') as readfile:
        return readfile.read()
+
+
def mangle_os_ifs(line, is_os, options):
    """
    function that mangles the os-related if-statements. This is needed because we want to keep these if-statements intact after jinja-parsing to build the directory structure.
    Mangling means inserting IF_MANGLED_PART right after the opening "{" and right before the closing "}" of the statement.
    We don't want to mangle all if-related statements (such as else and endif) so we need to keep track of the context of the last few if-statements.

    :param line: the current line to check for os-related if-statements
    :param is_os: variable keep track of the current os-state of the if-statements. Can be NON_OS_IF, NON_OS_IF_IN_OS_IF, OS_IF or OS_IF_IN_OS_IF
                    NON_OS_IF: not in an os-if
                    NON_OS_IF_IN_OS_IF: in a non-os-if nested in an os-if
                    OS_IF: in an os-if
                    OS_IF_IN_OS_IF: in an os-if nested in an os-if
    :param options: dictionary containing the options given by the user
    :return line: the modified line with mangled os-related if-statements
    :return is_os: the os-state after processing the line
    """
    statement_pattern = r'\{%(.*?)%}(.*)'

    # number of characters of the original line that lie in front of the part that is still to be searched
    consumed = 0
    # number of characters that were inserted into the line by mangling so far
    inserted = 0

    found = re.search(statement_pattern, line)

    while found:
        statement = found.group(1)

        # positions in the (possibly already mangled) line right after "{" and right before "}"
        open_pos = found.start() + consumed + inserted + 1
        close_pos = found.start(2) + consumed + inserted - 1

        # message prefix of the statement to mangle, None when the statement has to stay untouched
        mangle_message = None

        # this logic isn't flawless, there are number of nested if-constructions that are technically possible that would break this logic, but these don't appear in the documentation as it doesn't make sense to have these
        if 'endif' in statement:
            if is_os == OS_IF:
                mangle_message = "OS-specific endif statement found in line: "
                is_os = NON_OS_IF
            elif is_os == OS_IF_IN_OS_IF:
                mangle_message = "OS-specific endif statement found in line: "
                is_os = OS_IF
            elif is_os == NON_OS_IF_IN_OS_IF:
                is_os = OS_IF

        elif 'if ' in statement:
            if 'if OS' in statement:
                mangle_message = "OS-specific if statement found in line: "
                is_os = OS_IF_IN_OS_IF if is_os == OS_IF else OS_IF
            else:
                is_os = NON_OS_IF_IN_OS_IF if is_os == OS_IF else NON_OS_IF

        elif 'else' in statement:
            if is_os in (OS_IF, OS_IF_IN_OS_IF):
                mangle_message = "OS-specific else statement found in line: "

        if mangle_message is not None:
            if options[VERBOSE]:
                print(mangle_message + line[:-1])
            line = line[:open_pos] + IF_MANGLED_PART + line[open_pos:close_pos] + IF_MANGLED_PART + line[close_pos:]
            inserted += 2 * len(IF_MANGLED_PART)

        # continue the search in the text behind the statement that was just handled
        consumed += found.start(2)
        found = re.search(statement_pattern, found.group(2))

    return line, is_os
+
+
def mangle_ifs(directory, filename, options):
    """
    function that writes the if-mangled version of a file to a location where the jinja parser will use it

    :param directory: the path of the file to be if mangled
    :param filename: the filename under which the mangled file is written in the IF_MANGLED_FILES directory
    :param options: dictionary containing the options given by the user
    :return:
    """
    mangled_location = os.path.join(options[DESTINATION_DIRECTORY], IF_MANGLED_FILES, filename)

    # the if-statement scope carries over from one line to the next, starting outside of any os-if
    scope = NON_OS_IF

    with open(mangled_location, 'w') as write_file, open(directory, 'r') as read_file:
        for original_line in read_file:
            mangled_line, scope = mangle_os_ifs(original_line, scope, options)
            write_file.write(mangled_line)
+
+
def alter_ifs(filename, options):
    """
    Function that further adapts the if-statements in an if-mangled file and writes it back to the location where the jinja parser will use it.
    This is because the jinja parser doesn't seem to be able to handle statements like {% site == gent %} with context {'site': 'Gent'} in this case.
    These statements get changed to {% site == 'Gent' %} in this function.

    :param filename: the filename of the file to be transformed
    :param options: dictionary containing the options given by the user
    :return:
    """
    mangled_location = os.path.join(options[DESTINATION_DIRECTORY], IF_MANGLED_FILES, filename)

    with open(mangled_location, 'r') as read_file:
        content = read_file.read()

    pattern = r'(\{%-?\s?[a-zA-Z\s]*?[!=]=\s?\(?)([a-zA-Z\s]+(?:\sor\s[a-zA-Z\s]+)*)(\)?\s?%})'

    def quote_cities(match):
        # every operand of the comparison is stripped, capitalized and put between single quotes
        quoted = [f"'{city.strip().capitalize()}'" for city in match.group(2).split(" or ")]
        return f"{match.group(1)}" + " or ".join(quoted) + f"{match.group(3)}"

    content = re.sub(pattern, quote_cities, content)

    with open(mangled_location, 'w') as write_file:
        write_file.write(content)
+
+
def make_valid_title(title):
    """
    function that makes sure all titles can be used as valid filenames

    :param title: the string that will be used as title and filename
    :return valid_filename: the adapted title that can be used as filename
    """
    # get rid of extra information between {} brackets
    without_brackets = re.sub(r'\{.*?}', '', title)

    # drop the characters that are invalid in filenames on Windows or Linux
    without_invalid_chars = re.sub(r'[<>:"/\\|?*\0]', '', without_brackets)

    # trim surrounding whitespace and dashes
    trimmed = without_invalid_chars.strip().strip('-')

    # spaces become dashes, after which double dashes are replaced by a single one (one pass only)
    valid_filename = trimmed.replace(' ', '-').replace("--", "-")

    return valid_filename
+
+
def write_generic_file(title, paragraphs_text, paragraphs_metadata, title_order, title_order_number, options, is_linux_tutorial):
    """
    Function that writes text and metadata of a generic (non-os-specific) file

    :param title: title of section
    :param paragraphs_text: dictionary containing all paragraphs of text
    :param paragraphs_metadata: dictionary containing the metadata for all paragraphs of text
    :param title_order: list containing all subtitles in order
    :param title_order_number: order number of the title of the section that is being written
    :param options: dictionary containing the options given by the user
    :param is_linux_tutorial: boolean indicating whether the current file is part of the linux tutorial
    :return:
    """
    section_text = paragraphs_text[title]

    # don't write empty files
    if len(section_text) == 0:
        return

    # make the directory needed for the files that will be written
    filepath = os.path.join(options[DESTINATION_DIRECTORY], PARSED_MDS, GENERIC_DIR, paragraphs_metadata[title][DIRECTORY])
    os.makedirs(filepath, exist_ok=True)

    if options[VERBOSE]:
        print("Writing generic section " + title + " to filepath: " + str(filepath))

    write_files(title, section_text, paragraphs_metadata, title_order, title_order_number, filepath, GENERIC, options, is_linux_tutorial)
+
+
def write_files(title, text, paragraphs_metadata, title_order, title_order_number, filepath, OS, options, is_linux_tutorial):
    """
    Function that writes a section of text to "<title>.txt" and its metadata to a json file in the same directory

    The metadata stored for the section is copied and completed with the previous and next subtitle, the OS and a
    reference link before it is written; the dictionary that was passed in is not changed.

    :param title: title of the section to be written
    :param text: section of text to be written
    :param paragraphs_metadata: dictionary containing the metadata for all paragraphs of text
    :param title_order: list containing all subtitles in order
    :param title_order_number: order number of the title of the section that is being written
    :param filepath: filepath to write files to
    :param OS: OS to be included in the metadata
    :param options: dictionary containing the options given by the user
    :param is_linux_tutorial: boolean indicating whether the current file is part of the linux tutorial
    :return:
    """
    section_metadata = paragraphs_metadata[title]
    metadata = copy.deepcopy(section_metadata)

    # write text file, with the link markers resolved when the section has links
    with open(os.path.join(filepath, title + ".txt"), 'w') as text_file:
        output_text = text
        if LINKS in section_metadata:
            output_text, metadata[LINKS] = insert_links(text, metadata[LINKS], options)
        text_file.write(output_text)

    # drop the links from the metadata when none of them were actually present in the text
    if LINKS in metadata and len(metadata[LINKS]) == 0:
        del metadata[LINKS]

    # add previous subtitle (None for the first section)
    is_first = title_order_number == 0
    metadata[PREVIOUS_SUBTITLE] = None if is_first else title_order[title_order_number - 1]

    # add next subtitle (None for the last section)
    is_last = title_order_number == len(title_order) - 1
    metadata[NEXT_SUBTITLE] = None if is_last else title_order[title_order_number + 1]

    # add OS
    metadata[METADATA_OS] = OS

    # add reference link
    linux_part = LINUX_TUTORIAL + "/" if is_linux_tutorial else ""
    os_part = "" if OS == GENERIC else LINK_OS[OS] + "/"
    if "index" in section_metadata[MAIN_TITLE]:
        metadata[REFERENCE_LINK] = DOCS_URL
    else:
        # the anchor is the lowercased subtitle reduced to alphanumeric characters and dashes
        anchor = ''.join(char.lower() for char in section_metadata[SUBTITLE] if char.isalnum() or char == '-').strip('-')
        metadata[REFERENCE_LINK] = DOCS_URL + "/" + os_part + linux_part + section_metadata[MAIN_TITLE] + "/#" + anchor

    # write metadata to file
    with open(os.path.join(filepath, title + METADATA_EXTENSION + ".json"), 'w') as metadata_file:
        json.dump(metadata, metadata_file, indent=4)
+
+
def insert_links(text, links, options):
    """
    Function that inserts links in the plaintext or takes out the references to the links depending on the value of INCLUDE_LINKS_IN_PLAINTEXT

    A reference to a link is the number of the link between two LINK_MARKERs; the number is used as key in links.

    :param text: The plaintext that needs to be adapted
    :param links: dictionary with the links that might need to be inserted, keyed by their number as a string
    :param options: dictionary containing the options given by the user
    :return text: The adapted plaintext
    :return links: The links that were actually present in the text, renumbered from "0" in their original order
    """

    present_links = set()

    # the matches are taken from the original text, the replacements are made in the updated text
    for link_marker in re.finditer(LINK_MARKER + r'([0-9]*?)' + LINK_MARKER, text):
        link_number = link_marker.group(1)
        present_links.add(link_number)

        if options[INCLUDE_LINKS_IN_PLAINTEXT]:
            replacement = " " + links[link_number] + " "
        else:
            replacement = ""

        # a callable replacement inserts the link literally; a plain replacement string would have its
        # backslashes interpreted as escapes/group references by re.sub
        text = re.sub(LINK_MARKER + link_number + LINK_MARKER, lambda _match, replacement=replacement: replacement, text)

    # only keep the links that were referenced in the text and renumber them consecutively
    kept_links = [link for link_number, link in links.items() if link_number in present_links]
    new_links = {str(position): link for position, link in enumerate(kept_links)}

    return text, new_links
+
+
def split_and_write_os_specific_section(text, metadata, subtitle_order, title_order_number, all_metadata, options, is_linux_tutorial):
    """
    Function that renders an os-specific section once for every OS using jinja, splits each rendered version and writes the resulting sections away

    Note that all_metadata is updated in place with the metadata of the os-specific sections.

    :param text: full os specific section
    :param metadata: metadata generated for the full os specific section
    :param subtitle_order: order of the subtitles generated by the splitter
    :param title_order_number: order number of the section
    :param all_metadata: all metadata generated by the splitter
    :param options: dictionary containing the options given by the user
    :param is_linux_tutorial: boolean indicating whether the current file is part of the linux tutorial
    :return:
    """

    # Unmangle if's to use jinja parser
    text = re.sub(IF_MANGLED_PART, "", text)

    for OS in [LINUX, WINDOWS, MACOS]:

        # slightly alter if-statements to be able to use predefined macros
        # (done on a copy: the quotes added for one OS must not leak into the text rendered for the next OS,
        # where they would never be removed again)
        os_text = re.sub(OS, '"' + OS + '"', text)

        # Use jinja to render a different version of the text for each OS
        jinja_text = Template(os_text).render(OS=OS)

        # don't split empty texts
        if len(jinja_text) == 0:
            continue

        # add first subtitle in front of section again
        if options[SPLIT_ON_TITLES] or metadata[SUBTITLE] not in make_valid_title(jinja_text[:len(metadata[SUBTITLE]) + 1]):
            jinja_text = "#" * metadata[TITLE_DEPTH] + " " + metadata[SUBTITLE].replace("-", " ") + "\n" + jinja_text
        else:
            jinja_text = "#" * metadata[TITLE_DEPTH] + " " + jinja_text

        # re-adjust text to correct overcorrections
        jinja_text = re.sub('"' + OS + '"', OS, jinja_text)

        # the splitter works on files, so the rendered text is handed over through a temporary file
        with open(TEMP_JINJA_FILE, 'w') as writefile:
            writefile.write(jinja_text)

        # split in right way
        _, os_specific_text, os_specific_metadata, os_subtitle_order = split_text(TEMP_JINJA_FILE, metadata[MAIN_TITLE], options, is_linux_tutorial, current_paragraph_number=subtitle_order[title_order_number].split('_')[-1], OS=OS)

        # prepare variables to fix metadata: the os-specific subtitles take the place of the original section
        total_subtitle_order = subtitle_order[:title_order_number] + os_subtitle_order + subtitle_order[title_order_number+1:]
        all_metadata.update(os_specific_metadata)

        # write to files
        for os_i, os_subtitle in enumerate(os_subtitle_order):

            # check that file actually has some content, don't write empty files
            if len(os_specific_text[os_subtitle]) == 0:
                continue

            section_metadata = os_specific_metadata[os_subtitle]

            # add the links to the metadata
            if LINKS in metadata.keys():
                section_metadata[LINKS] = metadata[LINKS]

            # fix parent in the metadata: the parent is the last section in front of this one that is one title level higher
            parent_i = 0
            parent_depth = section_metadata[TITLE_DEPTH] - 1
            parent = section_metadata[MAIN_TITLE]

            # the bounds check comes first so that the list is never indexed past its end
            while parent_i < len(total_subtitle_order) and total_subtitle_order[parent_i] != os_subtitle:
                if all_metadata[total_subtitle_order[parent_i]][TITLE_DEPTH] == parent_depth:
                    parent = total_subtitle_order[parent_i]
                parent_i += 1

            if options[SPLIT_ON_PARAGRAPHS] and parent != section_metadata[MAIN_TITLE]:
                section_metadata[PARENT_TITLE] = all_metadata[parent][SUBTITLE]
            else:
                section_metadata[PARENT_TITLE] = parent

            # fix directory in the metadata if needed
            if options[DEEP_DIRECTORIES]:
                if parent == section_metadata[MAIN_TITLE]:
                    section_metadata[DIRECTORY] = os.path.join(parent, section_metadata[SUBTITLE])
                else:
                    section_metadata[DIRECTORY] = os.path.join(all_metadata[parent][DIRECTORY], section_metadata[SUBTITLE])
                section_metadata[DIRECTORY] = Path(section_metadata[DIRECTORY]).as_posix()

            # make a directory to save the files
            filepath = os.path.join(options[DESTINATION_DIRECTORY], PARSED_MDS, OS_SPECIFIC_DIR, OS, section_metadata[DIRECTORY])
            os.makedirs(filepath, exist_ok=True)

            if options[VERBOSE]:
                print("Writing os-specific section " + os_subtitle + " to filepath: " + str(filepath))

            # write to files
            write_files(os_subtitle, os_specific_text[os_subtitle], os_specific_metadata, total_subtitle_order, os_i + title_order_number, filepath, OS, options, is_linux_tutorial)
+
+
+def main(options):
+ """
+ main function
+
+ :param options: dictionary containing the options specified by the user to run the script:
+ {SOURCE_DIRECTORY: The source directory where the original files are located,
+ DESTINATION_DIRECTORY: The destination directory where the processed files should be written to,
+ SPLIT_ON_TITLES: boolean indicating whether to split on titles,
+ SPLIT_ON_PARAGRAPHS: boolean indicating whether to split on paragraphs (should always be the opposite of SPLIT_ON_TITLES),
+ MIN_PARAGRAPH_LENGTH: integer representing the minimum length of a paragraph,
+ MAX_TITLE_DEPTH: integer representing the maximum depth of a title for it to be used when splitting the text,
+ INCLUDE_LINKS_IN_PLAINTEXT: boolean indicating whether links should be included in the plaintext,
+ DEEP_DIRECTORIES: boolean indicating whether the generated directories should be nested by title-structure or not,
+ VERBOSE: enable or disable verbose mode}
+ :return:
+ """
+
+ if options[VERBOSE]:
+ print("Running chatbot parser with options: " + str(options))
+
+ if options[DEEP_DIRECTORIES] and options[VERBOSE]:
+ print("WARNING: This script generates a file structure that contains rather long filepaths. Depending on where the script is ran, some of these paths might exceed the maximum length allowed by the system resulting in problems opening the files.")
+
+ # remove the directories from a previous run of the parser if they weren't cleaned up properly for some reason
+ shutil.rmtree(os.path.join(options[DESTINATION_DIRECTORY], PARSED_MDS), ignore_errors=True)
+ shutil.rmtree(os.path.join(options[DESTINATION_DIRECTORY], COPIES), ignore_errors=True)
+ shutil.rmtree(os.path.join(options[DESTINATION_DIRECTORY], IF_MANGLED_FILES), ignore_errors=True)
+
+ # make the necessary directories
+ for directory in [COPIES, PARSED_MDS, IF_MANGLED_FILES]:
+ directory = os.path.join(options[DESTINATION_DIRECTORY], directory)
+ if not os.path.exists(directory):
+ os.makedirs(directory)
+
+ ################### define loop-invariant variables ###################
+
+ # constant that keeps track of the source directory
+ source_directory = options[SOURCE_DIRECTORY]
+
+ # list of all the filenames
+ filenames = {}
+ all_items = os.listdir(source_directory)
+ files = [f for f in all_items if os.path.isfile(os.path.join(source_directory, f)) and ".md" in f[-3:]]
+ for file in files:
+ filenames[file] = os.path.join(source_directory, file)
+
+ # for loops over all files
+ for filename in filenames.keys():
+ ################### define/reset loop specific variables ###################
+
+ # boolean indicating whether the current file is part of the linux tutorial
+ is_linux_tutorial = bool(LINUX_TUTORIAL in filenames[filename])
+
+ # make a copy of the original file in order to make sure the original does not get altered
+ copy_file = os.path.join(options[DESTINATION_DIRECTORY], COPIES, filename)
+ shutil.copyfile(filenames[filename], copy_file)
+
+ # variable that keeps track of the directories that are used to write in at different levels
+ root_dir_generic = os.path.join(options[DESTINATION_DIRECTORY], PARSED_MDS, GENERIC_DIR)
+ root_dir_os_specific = os.path.join(options[DESTINATION_DIRECTORY], PARSED_MDS, OS_SPECIFIC_DIR)
+ root_dir_os_specific_linux = os.path.join(root_dir_os_specific, LINUX)
+ root_dir_os_specific_windows = os.path.join(root_dir_os_specific, WINDOWS)
+ root_dir_os_specific_macos = os.path.join(root_dir_os_specific, MACOS)
+
+ # variable for the main title (needed for reference links)
+ main_title = filename[:-3]
+
+ # variable that keeps track of the directories that are used to write in at different levels
+ curr_dirs = [filename[:-3] for _ in range(options[MAX_TITLE_DEPTH] + 1)]
+
+ ################### actually parse the md file ###################
+
+ if options[VERBOSE]:
+ print(LINE + "Processing " + filename)
+ print("Location: " + filenames[filename])
+ print("\nMaking directories:")
+
+ # create directories for the source markdown file
+ for directory in [root_dir_generic, root_dir_os_specific, root_dir_os_specific_linux, root_dir_os_specific_windows, root_dir_os_specific_macos, os.path.join(root_dir_generic, curr_dirs[0]), os.path.join(root_dir_os_specific_linux, curr_dirs[0]), os.path.join(root_dir_os_specific_windows, curr_dirs[0]), os.path.join(root_dir_os_specific_macos, curr_dirs[0])]:
+ if options[VERBOSE]:
+ print(directory)
+ os.makedirs(directory, exist_ok=True)
+
+ if options[VERBOSE]:
+ print("\nParsing the sourcefile with jinja")
+
+ # process the jinja macros
+ jinja_parser(filename, copy_file, options)
+
+ if options[VERBOSE]:
+ print("\nSplitting the file for the first time (split in sufficiently small generic sections and large os-specific chunks)")
+
+ # split the text in paragraphs
+ paragraphs_os_text, paragraphs_os_free_text, paragraphs_metadata, subtitle_order = split_text(copy_file, main_title, options, is_linux_tutorial)
+
+ if options[VERBOSE]:
+ print("\nFurther splitting os-specific chunks and writing generic and os-specific sections to files with metadata")
+
+ # for every section, either make the whole section generic, or create an os-specific file for each OS
+ for i, subtitle in enumerate(subtitle_order):
+
+ # generic
+ if subtitle in paragraphs_os_free_text.keys():
+ write_generic_file(subtitle, paragraphs_os_free_text, paragraphs_metadata, subtitle_order, i, options, is_linux_tutorial)
+
+ # os-specific
+ else:
+ split_and_write_os_specific_section(paragraphs_os_text[subtitle], paragraphs_metadata[subtitle], subtitle_order, i, paragraphs_metadata, options, is_linux_tutorial)
+
+ if options[VERBOSE]:
+ print("\nFinished processing " + filename)
+
+ if options[VERBOSE]:
+ print(LINE + "Cleaning up directories:")
+ print(os.path.join(options[DESTINATION_DIRECTORY], COPIES))
+ print(os.path.join(options[DESTINATION_DIRECTORY], IF_MANGLED_FILES))
+ print(os.path.join(options[DESTINATION_DIRECTORY], LINUX_TUTORIAL))
+ # clean up temporary directories and files
+ shutil.rmtree(os.path.join(options[DESTINATION_DIRECTORY], COPIES), ignore_errors=True)
+ shutil.rmtree(os.path.join(options[DESTINATION_DIRECTORY], IF_MANGLED_FILES), ignore_errors=True)
+ shutil.rmtree(os.path.join(options[DESTINATION_DIRECTORY], LINUX_TUTORIAL), ignore_errors=True)
+ if os.path.exists(TEMP_JINJA_FILE):
+ os.remove(TEMP_JINJA_FILE)
+
+ if options[VERBOSE]:
+ print("Parsing finished successfully")
+
+
+################### run the script ###################
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Preprocessing script for the chatbot\n")
+
+    # command-line options; defaults shown in help texts are interpolated by argparse via %(default)s so they cannot drift from `default=`
+    parser.add_argument("-src", "--source", required=True, type=str, help="The source directory where the original files are located")
+    parser.add_argument("-dst", "--destination", required=True, type=str, help="The destination directory where the processed files should be written to")
+    parser.add_argument("-st", "--split_on_titles", action="store_true", help="Splits the text based on titles and subtitles instead of paragraphs with a minimum length.")
+    parser.add_argument("-pl", "--min_paragraph_length", type=int, default=512, help="Minimum length in characters of a paragraph, only works if split on titles is disabled (default: %(default)s)")
+    parser.add_argument("-td", "--max_title_depth", type=int, default=4, help="Maximum depth of titles that divide the source text into sections, only works if split on titles is enabled (default: %(default)s)")
+    parser.add_argument("-l", "--links", action="store_true", help="Add links to the output texts")
+    parser.add_argument("-dd", "--deep_directories", action="store_true", help="Generate a nested directory structure following the structure of the subtitles. Only works if split on titles is enabled")
+    parser.add_argument("-v", "--verbose", action="store_true", help="Run the script with verbose output")
+
+    args = parser.parse_args()
+    # build the options dictionary passed to main(): paragraph splitting is the negation of title splitting, and deep directories are only honoured together with title splitting
+    options_dict = {SOURCE_DIRECTORY: args.source,
+                    DESTINATION_DIRECTORY: args.destination,
+                    SPLIT_ON_TITLES: args.split_on_titles,
+                    SPLIT_ON_PARAGRAPHS: not args.split_on_titles,
+                    MIN_PARAGRAPH_LENGTH: args.min_paragraph_length,
+                    MAX_TITLE_DEPTH: args.max_title_depth,
+                    INCLUDE_LINKS_IN_PLAINTEXT: args.links,
+                    DEEP_DIRECTORIES: args.deep_directories and args.split_on_titles,
+                    VERBOSE: args.verbose}
+
+    main(options_dict)
diff --git a/scripts/HPC_chatbot_preprocessor/requirements.txt b/scripts/HPC_chatbot_preprocessor/requirements.txt
new file mode 100644
index 00000000000..1b9cb4a5052
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/requirements.txt
@@ -0,0 +1,5 @@
+PyYAML==6.0.2
+Jinja2==3.1.4
+tiktoken~=0.7.0
+pathlib~=1.0.1
+pytest
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/generic/tps1/tps1_paragraph_001.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/generic/tps1/tps1_paragraph_001.txt
new file mode 100644
index 00000000000..94270ff37e3
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/generic/tps1/tps1_paragraph_001.txt
@@ -0,0 +1,6 @@
+Main title
+This is the first paragraph of text. It is non-os-specific, however it does contain a link.
+It also contains some other Markdown syntax and an
+example code block.
+This intro needs to be sufficiently long as will be explained in the following section (we want to hit the minimum
+character limit for a section).
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/generic/tps1/tps1_paragraph_001_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/generic/tps1/tps1_paragraph_001_metadata.json
new file mode 100644
index 00000000000..31cbf626d8d
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/generic/tps1/tps1_paragraph_001_metadata.json
@@ -0,0 +1,15 @@
+{
+ "main_title": "tps1",
+ "subtitle": "Main-title",
+ "source_file": "tests/test_files/ftps/tps1.md",
+ "title_depth": 1,
+ "directory": "tps1",
+ "links": {
+ "0": "https://docs.hpc.ugent.be/generic"
+ },
+ "parent_title": "",
+ "previous_title": null,
+ "next_title": "tps1_paragraph_002",
+ "OS": "generic",
+ "reference_link": "https://docs.hpc.ugent.be/tps1/#main-title"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/generic/tps1/tps1_paragraph_003.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/generic/tps1/tps1_paragraph_003.txt
new file mode 100644
index 00000000000..58eedc06aa0
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/generic/tps1/tps1_paragraph_003.txt
@@ -0,0 +1,3 @@
+Conclusion
+Coming up with what to write in test texts is very hard. I think I got the most important test cases in there, but I
+might add to this if needed.
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/generic/tps1/tps1_paragraph_003_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/generic/tps1/tps1_paragraph_003_metadata.json
new file mode 100644
index 00000000000..cc7b47a8b5a
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/generic/tps1/tps1_paragraph_003_metadata.json
@@ -0,0 +1,12 @@
+{
+ "main_title": "tps1",
+ "subtitle": "Conclusion",
+ "source_file": "tests/test_files/ftps/tps1.md",
+ "title_depth": 2,
+ "directory": "tps1",
+ "parent_title": "",
+ "previous_title": "tps1_paragraph_002",
+ "next_title": null,
+ "OS": "generic",
+ "reference_link": "https://docs.hpc.ugent.be/tps1/#conclusion"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/linux/tps1/tps1_linux_paragraph_002.001.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/linux/tps1/tps1_linux_paragraph_002.001.txt
new file mode 100644
index 00000000000..d0ee9ce8256
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/linux/tps1/tps1_linux_paragraph_002.001.txt
@@ -0,0 +1,4 @@
+OS specific sections
+This is the second section, it is the start of some
+text specific to OSes that aren't windows. I feel like there is no need to make this section very long, however I will
+still add a link.
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/linux/tps1/tps1_linux_paragraph_002.001_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/linux/tps1/tps1_linux_paragraph_002.001_metadata.json
new file mode 100644
index 00000000000..fb165c8e7fc
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/linux/tps1/tps1_linux_paragraph_002.001_metadata.json
@@ -0,0 +1,15 @@
+{
+ "main_title": "tps1",
+ "subtitle": "OS-specific-sections",
+ "source_file": "tests/test_files/ftps/tps1.md",
+ "title_depth": 2,
+ "directory": "tps1",
+ "parent_title": "Main-title",
+ "links": {
+ "0": "https://docs.hpc.ugent.be/linuxmacos"
+ },
+ "previous_title": "tps1_paragraph_001",
+ "next_title": "tps1_linux_paragraph_002.002",
+ "OS": "linux",
+ "reference_link": "https://docs.hpc.ugent.be/Linux/tps1/#os-specific-sections"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/linux/tps1/tps1_linux_paragraph_002.002.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/linux/tps1/tps1_linux_paragraph_002.002.txt
new file mode 100644
index 00000000000..1a3867e69fa
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/linux/tps1/tps1_linux_paragraph_002.002.txt
@@ -0,0 +1,3 @@
+Non Windows section
+Whereas the Windows version of this section had a lot of unnecessary newlines, this one will just be a short and concise
+section that ends right here.
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/linux/tps1/tps1_linux_paragraph_002.002_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/linux/tps1/tps1_linux_paragraph_002.002_metadata.json
new file mode 100644
index 00000000000..36cda85cfcc
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/linux/tps1/tps1_linux_paragraph_002.002_metadata.json
@@ -0,0 +1,12 @@
+{
+ "main_title": "tps1",
+ "subtitle": "Non-Windows-section",
+ "source_file": "tests/test_files/ftps/tps1.md",
+ "title_depth": 3,
+ "directory": "tps1",
+ "parent_title": "OS-specific-sections",
+ "previous_title": "tps1_linux_paragraph_002.001",
+ "next_title": "tps1_paragraph_003",
+ "OS": "linux",
+ "reference_link": "https://docs.hpc.ugent.be/Linux/tps1/#non-windows-section"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/macos/tps1/tps1_macos_paragraph_002.001.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/macos/tps1/tps1_macos_paragraph_002.001.txt
new file mode 100644
index 00000000000..e0642d6ac96
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/macos/tps1/tps1_macos_paragraph_002.001.txt
@@ -0,0 +1,4 @@
+OS specific sections
+This is the second section, it is the start of some
+text specific to OSes that aren't "windows". I feel like there is no need to make this section very long, however I will
+still add a link.
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/macos/tps1/tps1_macos_paragraph_002.001_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/macos/tps1/tps1_macos_paragraph_002.001_metadata.json
new file mode 100644
index 00000000000..2de51c7c0e1
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/macos/tps1/tps1_macos_paragraph_002.001_metadata.json
@@ -0,0 +1,15 @@
+{
+ "main_title": "tps1",
+ "subtitle": "OS-specific-sections",
+ "source_file": "tests/test_files/ftps/tps1.md",
+ "title_depth": 2,
+ "directory": "tps1",
+ "parent_title": "Main-title",
+ "links": {
+ "0": "https://docs.hpc.ugent.be/linuxmacos"
+ },
+ "previous_title": "tps1_paragraph_001",
+ "next_title": "tps1_macos_paragraph_002.002",
+ "OS": "macos",
+ "reference_link": "https://docs.hpc.ugent.be/macOS/tps1/#os-specific-sections"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/macos/tps1/tps1_macos_paragraph_002.002.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/macos/tps1/tps1_macos_paragraph_002.002.txt
new file mode 100644
index 00000000000..1a3867e69fa
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/macos/tps1/tps1_macos_paragraph_002.002.txt
@@ -0,0 +1,3 @@
+Non Windows section
+Whereas the Windows version of this section had a lot of unnecessary newlines, this one will just be a short and concise
+section that ends right here.
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/macos/tps1/tps1_macos_paragraph_002.002_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/macos/tps1/tps1_macos_paragraph_002.002_metadata.json
new file mode 100644
index 00000000000..fb48000e679
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/macos/tps1/tps1_macos_paragraph_002.002_metadata.json
@@ -0,0 +1,12 @@
+{
+ "main_title": "tps1",
+ "subtitle": "Non-Windows-section",
+ "source_file": "tests/test_files/ftps/tps1.md",
+ "title_depth": 3,
+ "directory": "tps1",
+ "parent_title": "OS-specific-sections",
+ "previous_title": "tps1_macos_paragraph_002.001",
+ "next_title": "tps1_paragraph_003",
+ "OS": "macos",
+ "reference_link": "https://docs.hpc.ugent.be/macOS/tps1/#non-windows-section"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/windows/tps1/tps1_windows_paragraph_002.001.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/windows/tps1/tps1_windows_paragraph_002.001.txt
new file mode 100644
index 00000000000..9a9cbe1f3d2
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/windows/tps1/tps1_windows_paragraph_002.001.txt
@@ -0,0 +1,7 @@
+OS specific sections
+This is the second section, it is the start of some text specific to windows.
+In this section it is probably no longer needed to test the Markdown syntax again, however I will make it somewhat longer
+to make sure we get a long section that is over the minimum required length for the next newline character to be
+classified as the end of this section. I am doing this because for the next sections I want to test whether they will be
+grouped together if they are not long enough to reach the minimum paragraph length on their own. Also, before I forget,
+let's add a link in this section as well.
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/windows/tps1/tps1_windows_paragraph_002.001_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/windows/tps1/tps1_windows_paragraph_002.001_metadata.json
new file mode 100644
index 00000000000..00b7fcee452
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/windows/tps1/tps1_windows_paragraph_002.001_metadata.json
@@ -0,0 +1,15 @@
+{
+ "main_title": "tps1",
+ "subtitle": "OS-specific-sections",
+ "source_file": "tests/test_files/ftps/tps1.md",
+ "title_depth": 2,
+ "directory": "tps1",
+ "parent_title": "Main-title",
+ "links": {
+ "0": "https://docs.hpc.ugent.be/windows"
+ },
+ "previous_title": "tps1_paragraph_001",
+ "next_title": "tps1_windows_paragraph_002.002",
+ "OS": "windows",
+ "reference_link": "https://docs.hpc.ugent.be/Windows/tps1/#os-specific-sections"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/windows/tps1/tps1_windows_paragraph_002.002.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/windows/tps1/tps1_windows_paragraph_002.002.txt
new file mode 100644
index 00000000000..6b57235f68f
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/windows/tps1/tps1_windows_paragraph_002.002.txt
@@ -0,0 +1,6 @@
+Windows specific section
+Like this.
+And this.
+And also this.
+These section should all be grouped together under the windows specific section of the output. The addition of this long
+section at the end should make sure the combination of sections comes to an end here.
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/windows/tps1/tps1_windows_paragraph_002.002_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/windows/tps1/tps1_windows_paragraph_002.002_metadata.json
new file mode 100644
index 00000000000..0e38a476d04
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/output/parsed_mds/os_specific/windows/tps1/tps1_windows_paragraph_002.002_metadata.json
@@ -0,0 +1,12 @@
+{
+ "main_title": "tps1",
+ "subtitle": "Windows-specific-section",
+ "source_file": "tests/test_files/ftps/tps1.md",
+ "title_depth": 3,
+ "directory": "tps1",
+ "parent_title": "OS-specific-sections",
+ "previous_title": "tps1_windows_paragraph_002.001",
+ "next_title": "tps1_paragraph_003",
+ "OS": "windows",
+ "reference_link": "https://docs.hpc.ugent.be/Windows/tps1/#windows-specific-section"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/tps1.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/tps1.md
new file mode 100644
index 00000000000..d9b10d0c524
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftps/tps1.md
@@ -0,0 +1,43 @@
+# Main title
+
+This is the first paragraph of text. It is non-os-specific, however it does contain [a link](generic.md).
+It also contains some `other` *Markdown* _syntax_ and an
+```shell
+example code block.
+```
+This intro needs to be sufficiently long as will be explained in the following section (we want to hit the minimum
+character limit for a section).
+
+## OS specific sections
+
+This is the second section, it is the start of some {% if OS == windows %} text specific to windows.
+In this section it is probably no longer needed to test the Markdown syntax again, however I will make it somewhat longer
+to make sure we get a long section that is over the minimum required length for the next newline character to be
+classified as the end of this section. I am doing this because for the next sections I want to test whether they will be
+grouped together if they are not long enough to reach the minimum paragraph length on their own. Also, before I forget,
+let's add [a link](windows.md) in this section as well.
+
+### Windows specific section
+
+Like this.
+
+And this.
+
+And also this.
+
+These section should all be grouped together under the windows specific section of the output. The addition of this long
+section at the end should make sure the combination of sections comes to an end here.
+{% else %}
+text specific to OSes that aren't windows. I feel like there is no need to make this section very long, however I will
+still add [a link](linuxmacos.md).
+
+### Non Windows section
+
+Whereas the Windows version of this section had a lot of unnecessary newlines, this one will just be a short and concise
+section that ends right here.
+{% endif %}
+
+## Conclusion
+
+Coming up with what to write in test texts is very hard. I think I got the most important test cases in there, but I
+might add to this if needed.
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/generic/tts1/Main-title/Subtitle-1/Subtitle-1.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/generic/tts1/Main-title/Subtitle-1/Subtitle-1.txt
new file mode 100644
index 00000000000..f62a4f31fee
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/generic/tts1/Main-title/Subtitle-1/Subtitle-1.txt
@@ -0,0 +1,2 @@
+blablabla
+blablablabla
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/generic/tts1/Main-title/Subtitle-1/Subtitle-1_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/generic/tts1/Main-title/Subtitle-1/Subtitle-1_metadata.json
new file mode 100644
index 00000000000..e481468cefe
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/generic/tts1/Main-title/Subtitle-1/Subtitle-1_metadata.json
@@ -0,0 +1,12 @@
+{
+ "main_title": "tts1",
+ "subtitle": "Subtitle-1",
+ "source_file": "tests/test_files/ftts/tts1.md",
+ "title_depth": 2,
+ "directory": "tts1/Main-title/Subtitle-1",
+ "parent_title": "Main-title",
+ "previous_title": "Main-title",
+ "next_title": "Subtitle-2-g",
+ "OS": "generic",
+ "reference_link": "https://docs.hpc.ugent.be/tts1/#subtitle-1"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/generic/tts1/Main-title/Subtitle-5-g/Subtitle-5-g.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/generic/tts1/Main-title/Subtitle-5-g/Subtitle-5-g.txt
new file mode 100644
index 00000000000..bdf68551202
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/generic/tts1/Main-title/Subtitle-5-g/Subtitle-5-g.txt
@@ -0,0 +1 @@
+blablabla
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/generic/tts1/Main-title/Subtitle-5-g/Subtitle-5-g_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/generic/tts1/Main-title/Subtitle-5-g/Subtitle-5-g_metadata.json
new file mode 100644
index 00000000000..100766dd865
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/generic/tts1/Main-title/Subtitle-5-g/Subtitle-5-g_metadata.json
@@ -0,0 +1,12 @@
+{
+ "main_title": "tts1",
+ "subtitle": "Subtitle-5-g",
+ "source_file": "tests/test_files/ftts/tts1.md",
+ "title_depth": 2,
+ "directory": "tts1/Main-title/Subtitle-5-g",
+ "parent_title": "Main-title",
+ "previous_title": "Subtitle-2-g",
+ "next_title": null,
+ "OS": "generic",
+ "reference_link": "https://docs.hpc.ugent.be/tts1/#subtitle-5-g"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/linux/tts1/Main-title/Subtitle-2-g/Subtitle-2-g.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/linux/tts1/Main-title/Subtitle-2-g/Subtitle-2-g.txt
new file mode 100644
index 00000000000..48125d91679
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/linux/tts1/Main-title/Subtitle-2-g/Subtitle-2-g.txt
@@ -0,0 +1,4 @@
+blablabla generic
+blablabla generic
+blablabla Linux macOS
+blablablabla Linux macOS with a link
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/linux/tts1/Main-title/Subtitle-2-g/Subtitle-2-g_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/linux/tts1/Main-title/Subtitle-2-g/Subtitle-2-g_metadata.json
new file mode 100644
index 00000000000..6f42345d013
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/linux/tts1/Main-title/Subtitle-2-g/Subtitle-2-g_metadata.json
@@ -0,0 +1,15 @@
+{
+ "main_title": "tts1",
+ "subtitle": "Subtitle-2-g",
+ "source_file": "tests/test_files/ftts/tts1.md",
+ "title_depth": 2,
+ "directory": "tts1/Main-title/Subtitle-2-g",
+ "parent_title": "Main-title",
+ "links": {
+ "0": "https://docs.hpc.ugent.be/linuxmacos"
+ },
+ "previous_title": "Subtitle-1",
+ "next_title": "Subtitle-4-l&m",
+ "OS": "linux",
+ "reference_link": "https://docs.hpc.ugent.be/Linux/tts1/#subtitle-2-g"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/linux/tts1/Main-title/Subtitle-2-g/Subtitle-4-l&m/Subtitle-4-l&m.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/linux/tts1/Main-title/Subtitle-2-g/Subtitle-4-l&m/Subtitle-4-l&m.txt
new file mode 100644
index 00000000000..b221f26074b
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/linux/tts1/Main-title/Subtitle-2-g/Subtitle-4-l&m/Subtitle-4-l&m.txt
@@ -0,0 +1,3 @@
+blablabla Linux macOS
+blablablabla Linux macOS
+blablabla generic with a link
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/linux/tts1/Main-title/Subtitle-2-g/Subtitle-4-l&m/Subtitle-4-l&m_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/linux/tts1/Main-title/Subtitle-2-g/Subtitle-4-l&m/Subtitle-4-l&m_metadata.json
new file mode 100644
index 00000000000..351b6f5cca6
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/linux/tts1/Main-title/Subtitle-2-g/Subtitle-4-l&m/Subtitle-4-l&m_metadata.json
@@ -0,0 +1,15 @@
+{
+ "main_title": "tts1",
+ "subtitle": "Subtitle-4-l&m",
+ "source_file": "tests/test_files/ftts/tts1.md",
+ "title_depth": 3,
+ "directory": "tts1/Main-title/Subtitle-2-g/Subtitle-4-l&m",
+ "parent_title": "Subtitle-2-g",
+ "links": {
+ "0": "https://docs.hpc.ugent.be/generic"
+ },
+ "previous_title": "Subtitle-2-g",
+ "next_title": "Subtitle-5-g",
+ "OS": "linux",
+ "reference_link": "https://docs.hpc.ugent.be/Linux/tts1/#subtitle-4-lm"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/macos/tts1/Main-title/Subtitle-2-g/Subtitle-2-g.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/macos/tts1/Main-title/Subtitle-2-g/Subtitle-2-g.txt
new file mode 100644
index 00000000000..48125d91679
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/macos/tts1/Main-title/Subtitle-2-g/Subtitle-2-g.txt
@@ -0,0 +1,4 @@
+blablabla generic
+blablabla generic
+blablabla Linux macOS
+blablablabla Linux macOS with a link
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/macos/tts1/Main-title/Subtitle-2-g/Subtitle-2-g_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/macos/tts1/Main-title/Subtitle-2-g/Subtitle-2-g_metadata.json
new file mode 100644
index 00000000000..30249d3d155
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/macos/tts1/Main-title/Subtitle-2-g/Subtitle-2-g_metadata.json
@@ -0,0 +1,15 @@
+{
+ "main_title": "tts1",
+ "subtitle": "Subtitle-2-g",
+ "source_file": "tests/test_files/ftts/tts1.md",
+ "title_depth": 2,
+ "directory": "tts1/Main-title/Subtitle-2-g",
+ "parent_title": "Main-title",
+ "links": {
+ "0": "https://docs.hpc.ugent.be/linuxmacos"
+ },
+ "previous_title": "Subtitle-1",
+ "next_title": "Subtitle-4-l&m",
+ "OS": "macos",
+ "reference_link": "https://docs.hpc.ugent.be/macOS/tts1/#subtitle-2-g"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/macos/tts1/Main-title/Subtitle-2-g/Subtitle-4-l&m/Subtitle-4-l&m.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/macos/tts1/Main-title/Subtitle-2-g/Subtitle-4-l&m/Subtitle-4-l&m.txt
new file mode 100644
index 00000000000..b221f26074b
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/macos/tts1/Main-title/Subtitle-2-g/Subtitle-4-l&m/Subtitle-4-l&m.txt
@@ -0,0 +1,3 @@
+blablabla Linux macOS
+blablablabla Linux macOS
+blablabla generic with a link
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/macos/tts1/Main-title/Subtitle-2-g/Subtitle-4-l&m/Subtitle-4-l&m_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/macos/tts1/Main-title/Subtitle-2-g/Subtitle-4-l&m/Subtitle-4-l&m_metadata.json
new file mode 100644
index 00000000000..087fe810609
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/macos/tts1/Main-title/Subtitle-2-g/Subtitle-4-l&m/Subtitle-4-l&m_metadata.json
@@ -0,0 +1,15 @@
+{
+ "main_title": "tts1",
+ "subtitle": "Subtitle-4-l&m",
+ "source_file": "tests/test_files/ftts/tts1.md",
+ "title_depth": 3,
+ "directory": "tts1/Main-title/Subtitle-2-g/Subtitle-4-l&m",
+ "parent_title": "Subtitle-2-g",
+ "links": {
+ "0": "https://docs.hpc.ugent.be/generic"
+ },
+ "previous_title": "Subtitle-2-g",
+ "next_title": "Subtitle-5-g",
+ "OS": "macos",
+ "reference_link": "https://docs.hpc.ugent.be/macOS/tts1/#subtitle-4-lm"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/windows/tts1/Main-title/Subtitle-2-g/Subtitle-2-g.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/windows/tts1/Main-title/Subtitle-2-g/Subtitle-2-g.txt
new file mode 100644
index 00000000000..f9f20592832
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/windows/tts1/Main-title/Subtitle-2-g/Subtitle-2-g.txt
@@ -0,0 +1,4 @@
+blablabla generic
+blablabla generic
+blablabla windows
+blablabla windows with a link
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/windows/tts1/Main-title/Subtitle-2-g/Subtitle-2-g_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/windows/tts1/Main-title/Subtitle-2-g/Subtitle-2-g_metadata.json
new file mode 100644
index 00000000000..da3c61d3edc
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/windows/tts1/Main-title/Subtitle-2-g/Subtitle-2-g_metadata.json
@@ -0,0 +1,15 @@
+{
+ "main_title": "tts1",
+ "subtitle": "Subtitle-2-g",
+ "source_file": "tests/test_files/ftts/tts1.md",
+ "title_depth": 2,
+ "directory": "tts1/Main-title/Subtitle-2-g",
+ "parent_title": "Main-title",
+ "links": {
+ "0": "https://docs.hpc.ugent.be/windows"
+ },
+ "previous_title": "Subtitle-1",
+ "next_title": "Subtitle-3-w",
+ "OS": "windows",
+ "reference_link": "https://docs.hpc.ugent.be/Windows/tts1/#subtitle-2-g"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/windows/tts1/Main-title/Subtitle-2-g/Subtitle-3-w/Subtitle-3-w.txt b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/windows/tts1/Main-title/Subtitle-2-g/Subtitle-3-w/Subtitle-3-w.txt
new file mode 100644
index 00000000000..0b587cef85a
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/windows/tts1/Main-title/Subtitle-2-g/Subtitle-3-w/Subtitle-3-w.txt
@@ -0,0 +1,3 @@
+blablabla windows
+blablablabla windows
+blablabla generic with a link
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/windows/tts1/Main-title/Subtitle-2-g/Subtitle-3-w/Subtitle-3-w_metadata.json b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/windows/tts1/Main-title/Subtitle-2-g/Subtitle-3-w/Subtitle-3-w_metadata.json
new file mode 100644
index 00000000000..e07586cf55e
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/output/parsed_mds/os_specific/windows/tts1/Main-title/Subtitle-2-g/Subtitle-3-w/Subtitle-3-w_metadata.json
@@ -0,0 +1,15 @@
+{
+ "main_title": "tts1",
+ "subtitle": "Subtitle-3-w",
+ "source_file": "tests/test_files/ftts/tts1.md",
+ "title_depth": 3,
+ "directory": "tts1/Main-title/Subtitle-2-g/Subtitle-3-w",
+ "parent_title": "Subtitle-2-g",
+ "links": {
+ "0": "https://docs.hpc.ugent.be/generic"
+ },
+ "previous_title": "Subtitle-2-g",
+ "next_title": "Subtitle-5-g",
+ "OS": "windows",
+ "reference_link": "https://docs.hpc.ugent.be/Windows/tts1/#subtitle-3-w"
+}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/tts1.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/tts1.md
new file mode 100644
index 00000000000..2f3ad7f9c08
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/ftts/tts1.md
@@ -0,0 +1,31 @@
+# Main title
+
+## Subtitle 1
+
+blablabla
+blablablabla
+
+## Subtitle 2 g
+
+blablabla generic
+blablabla generic
+{% if OS == windows %}blablabla windows
+blablabla windows with a [link](windows.md)
+
+### Subtitle 3 w
+
+blablabla windows
+blablablabla windows
+{% else %}blablabla Linux macOS
+blablablabla Linux macOS with a [link](linuxmacos.md)
+
+### Subtitle 4 l&m
+
+blablabla Linux macOS
+blablablabla Linux macOS
+{% endif %}
+blablabla generic with a [link](generic.md)
+
+## Subtitle 5 g
+
+blablabla
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_1_input.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_1_input.md
new file mode 100644
index 00000000000..6a74b3c0181
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_1_input.md
@@ -0,0 +1,4 @@
+test1: OS_IF
+{% if OS == windows %}
+test1
+{% endif %}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_1_output.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_1_output.md
new file mode 100644
index 00000000000..2f9cdc38294
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_1_output.md
@@ -0,0 +1,4 @@
+test1: OS_IF
+{-if-% if OS == windows %-if-}
+test1
+{-if-% endif %-if-}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_2_input.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_2_input.md
new file mode 100644
index 00000000000..360a4a59ba3
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_2_input.md
@@ -0,0 +1,7 @@
+test2: OS_IF in NON_OS_IF
+{% if site == Gent %}
+test2
+{% if OS == windows %}
+test2
+{% endif %}
+{% endif %}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_2_output.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_2_output.md
new file mode 100644
index 00000000000..798dcf6db24
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_2_output.md
@@ -0,0 +1,7 @@
+test2: OS_IF in NON_OS_IF
+{% if site == Gent %}
+test2
+{-if-% if OS == windows %-if-}
+test2
+{-if-% endif %-if-}
+{% endif %}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_3_input.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_3_input.md
new file mode 100644
index 00000000000..d93125a5971
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_3_input.md
@@ -0,0 +1,6 @@
+test3: OS_IF with else
+{% if OS == linux %}
+test3
+{% else %}
+test3
+{% endif %}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_3_output.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_3_output.md
new file mode 100644
index 00000000000..02141961338
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_3_output.md
@@ -0,0 +1,6 @@
+test3: OS_IF with else
+{-if-% if OS == linux %-if-}
+test3
+{-if-% else %-if-}
+test3
+{-if-% endif %-if-}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_4_input.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_4_input.md
new file mode 100644
index 00000000000..cc15fae1df1
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_4_input.md
@@ -0,0 +1,4 @@
+test4: OS_IF with wrong syntax
+{ if OS == macos }
+test4
+{ endif }
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_4_output.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_4_output.md
new file mode 100644
index 00000000000..cc15fae1df1
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_4_output.md
@@ -0,0 +1,4 @@
+test4: OS_IF with wrong syntax
+{ if OS == macos }
+test4
+{ endif }
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_5_input.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_5_input.md
new file mode 100644
index 00000000000..bdb288474e2
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_5_input.md
@@ -0,0 +1,11 @@
+test5: OS_IF in OS_IF
+{% if OS == windows %}
+test5
+{% else %}
+{% if OS == linux %}
+test5
+{% else %}
+test5
+{% endif %}
+test5
+{% endif %}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_5_output.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_5_output.md
new file mode 100644
index 00000000000..10443eb67a4
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_5_output.md
@@ -0,0 +1,11 @@
+test5: OS_IF in OS_IF
+{-if-% if OS == windows %-if-}
+test5
+{-if-% else %-if-}
+{-if-% if OS == linux %-if-}
+test5
+{-if-% else %-if-}
+test5
+{-if-% endif %-if-}
+test5
+{-if-% endif %-if-}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_6_input.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_6_input.md
new file mode 100644
index 00000000000..0731ee3588c
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_6_input.md
@@ -0,0 +1,8 @@
+test6: NON_OS_IF in OS_IF
+{% if OS == macos %}
+test6
+{% if site == Gent %}
+test6
+{% endif %}
+test6
+{% endif %}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_6_output.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_6_output.md
new file mode 100644
index 00000000000..cd37117cb00
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_6_output.md
@@ -0,0 +1,8 @@
+test6: NON_OS_IF in OS_IF
+{-if-% if OS == macos %-if-}
+test6
+{% if site == Gent %}
+test6
+{% endif %}
+test6
+{-if-% endif %-if-}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_7_input.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_7_input.md
new file mode 100644
index 00000000000..6a72a338527
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_7_input.md
@@ -0,0 +1,9 @@
+test7: weird spacing and dashes
+ {%if OS == windows %}
+ test7
+{%- else%}
+ test7
+ {% if OS == linux%}
+test7
+ {%-endif %}
+{%endif%}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_7_output.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_7_output.md
new file mode 100644
index 00000000000..dfe342ebfb1
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_7_output.md
@@ -0,0 +1,9 @@
+test7: weird spacing and dashes
+ {-if-%if OS == windows %-if-}
+ test7
+{-if-%- else%-if-}
+ test7
+ {-if-% if OS == linux%-if-}
+test7
+ {-if-%-endif %-if-}
+{-if-%endif%-if-}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_input.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_input.md
new file mode 100644
index 00000000000..fb8c1f8b539
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_input.md
@@ -0,0 +1,55 @@
+test1: OS_IF
+{% if OS == windows %}
+test1
+{% endif %}
+
+test2: OS_IF in NON_OS_IF
+{% if site == Gent %}
+test2
+{% if OS == windows %}
+test2
+{% endif %}
+{% endif %}
+
+test3: OS_IF with else
+{% if OS == linux %}
+test3
+{% else %}
+test3
+{% endif %}
+
+test4: OS_IF with wrong syntax
+{ if OS == macos }
+test4
+{ endif }
+
+test5: OS_IF in OS_IF
+{% if OS == windows %}
+test5
+{% else %}
+{% if OS == linux %}
+test5
+{% else %}
+test5
+{% endif %}
+test5
+{% endif %}
+
+test6: NON_OS_IF in OS_IF
+{% if OS == macos %}
+test6
+{% if site == Gent %}
+test6
+{% endif %}
+test6
+{% endif %}
+
+test7: weird spacing and dashes
+ {%if OS == windows %}
+ test7
+{%- else%}
+ test7
+ {% if OS == linux%}
+test7
+ {%-endif %}
+{%endif%}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_output.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_output.md
new file mode 100644
index 00000000000..796e94348fa
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/if_mangler_test_files/if_mangler_output.md
@@ -0,0 +1,55 @@
+test1: OS_IF
+{-if-% if OS == windows %-if-}
+test1
+{-if-% endif %-if-}
+
+test2: OS_IF in NON_OS_IF
+{% if site == Gent %}
+test2
+{-if-% if OS == windows %-if-}
+test2
+{-if-% endif %-if-}
+{% endif %}
+
+test3: OS_IF with else
+{-if-% if OS == linux %-if-}
+test3
+{-if-% else %-if-}
+test3
+{-if-% endif %-if-}
+
+test4: OS_IF with wrong syntax
+{ if OS == macos }
+test4
+{ endif }
+
+test5: OS_IF in OS_IF
+{-if-% if OS == windows %-if-}
+test5
+{-if-% else %-if-}
+{-if-% if OS == linux %-if-}
+test5
+{-if-% else %-if-}
+test5
+{-if-% endif %-if-}
+test5
+{-if-% endif %-if-}
+
+test6: NON_OS_IF in OS_IF
+{-if-% if OS == macos %-if-}
+test6
+{% if site == Gent %}
+test6
+{% endif %}
+test6
+{-if-% endif %-if-}
+
+test7: weird spacing and dashes
+ {-if-%if OS == windows %-if-}
+ test7
+{-if-%- else%-if-}
+ test7
+ {-if-% if OS == linux%-if-}
+test7
+ {-if-%-endif %-if-}
+{-if-%endif%-if-}
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_files/list_file/list_test.md b/scripts/HPC_chatbot_preprocessor/tests/test_files/list_file/list_test.md
new file mode 100644
index 00000000000..1e18a1495d5
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_files/list_file/list_test.md
@@ -0,0 +1,15 @@
+# Title
+
+Some explanation about the following list that is quite long. This could be problematic since this could mean that the explanation of the content of the list would be part of a different paragraph than the list.
+
+1. First entry that is very verbose since we want to hit the character limit for a paragraph to make sure a list can't be split in the middle. If this entry is long enough, the character limit should make it so that any of the following newlines can be the start of a new section if the splitter doesn't know it is in a list.
+
+2. Second entry
+
+3. Third entry
+
+ ![image](img/an_image_for_the_third_entry.png)
+
+4. Fourth entry that is very verbose, so we hit the character limit for a section split, even though it shouldn't be necessary since the explanation of the list is already well above the character limit.
+
+And now the text continues like normal in a new section.
\ No newline at end of file
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_full_script.py b/scripts/HPC_chatbot_preprocessor/tests/test_full_script.py
new file mode 100644
index 00000000000..99baf41ebc0
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_full_script.py
@@ -0,0 +1,68 @@
+import pytest
+import os
+import shutil
+from chatbot_parser import main
+
+
+@pytest.mark.parametrize("input_directory,actual_output_directory,expected_output_directory, options", [
+    ("tests/test_files/ftps", "tests/test_files/ftps/actual",
+     "tests/test_files/ftps/output",
+     {"SOURCE_DIRECTORY": "tests/test_files/ftps",
+      "DESTINATION_DIRECTORY": "tests/test_files/ftps/actual",
+      "SPLIT_ON_TITLES": False,
+      "SPLIT_ON_PARAGRAPHS": True,
+      "MIN_PARAGRAPH_LENGTH": 50,
+      "MAX_TITLE_DEPTH": 4,
+      "INCLUDE_LINKS_IN_PLAINTEXT": False,
+      "DEEP_DIRECTORIES": False,
+      "VERBOSE": False}
+     ),
+    ("tests/test_files/ftts", "tests/test_files/ftts/actual",
+     "tests/test_files/ftts/output",
+     {"SOURCE_DIRECTORY": "tests/test_files/ftts",
+      "DESTINATION_DIRECTORY": "tests/test_files/ftts/actual",
+      "SPLIT_ON_TITLES": True,
+      "SPLIT_ON_PARAGRAPHS": False,
+      "MIN_PARAGRAPH_LENGTH": 160,
+      "MAX_TITLE_DEPTH": 4,
+      "INCLUDE_LINKS_IN_PLAINTEXT": False,
+      "DEEP_DIRECTORIES": True,
+      "VERBOSE": False}
+     )
+])
+def test_full_script_generated_directories(input_directory, actual_output_directory, expected_output_directory, options):
+    """Run main(options) and compare the generated directory tree with the expected one."""
+    try:
+        # run the script
+        main(options)
+
+        # Compare directories and files
+        for dirpath, dirnames, filenames in os.walk(expected_output_directory):
+            relative_path = os.path.relpath(dirpath, expected_output_directory)
+            actual_dir = os.path.join(actual_output_directory, relative_path)
+            # Check if the directory exists
+            assert os.path.isdir(actual_dir), f"Directory '{actual_dir}' is missing."
+
+            # Check for files
+            for filename in filenames:
+                ref_file = os.path.join(dirpath, filename)
+                gen_file = os.path.join(actual_dir, filename)
+                # Check if the file exists
+                assert os.path.isfile(gen_file), f"File '{gen_file}' is missing."
+
+                # Check file content (leading and trailing whitespace is ignored)
+                with open(ref_file, 'r') as ref_f, open(gen_file, 'r') as gen_f:
+                    ref_content = ref_f.read().strip()
+                    gen_content = gen_f.read().strip()
+                assert ref_content == gen_content, f"Content of file '{gen_file}' does not match."
+
+        # check that not too many directories have been generated
+        for dirpath, dirnames, filenames in os.walk(actual_output_directory):
+            relative_path = os.path.relpath(dirpath, actual_output_directory)
+            expected_dir = os.path.join(expected_output_directory, relative_path)
+            # Check if the directory exists
+            assert os.path.isdir(expected_dir), f"Directory '{relative_path}' was made, but shouldn't have been."
+    finally:
+        # remove the generated directory even when an assertion fails, so a failed
+        # run cannot leave stale output behind for the next one
+        shutil.rmtree(actual_output_directory, ignore_errors=True)
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_if_mangler.py b/scripts/HPC_chatbot_preprocessor/tests/test_if_mangler.py
new file mode 100644
index 00000000000..c2ae9fea19e
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_if_mangler.py
@@ -0,0 +1,32 @@
+import pytest
+import os
+import shutil
+from chatbot_parser import mangle_ifs
+
+
+@pytest.mark.parametrize("input_file,output_file", [
+    ("if_mangler_1_input.md", "if_mangler_1_output.md"),
+    ("if_mangler_2_input.md", "if_mangler_2_output.md"),
+    ("if_mangler_3_input.md", "if_mangler_3_output.md"),
+    ("if_mangler_4_input.md", "if_mangler_4_output.md"),
+    ("if_mangler_5_input.md", "if_mangler_5_output.md"),
+    ("if_mangler_6_input.md", "if_mangler_6_output.md"),
+    ("if_mangler_7_input.md", "if_mangler_7_output.md")
+])
+def test_if_mangler(input_file, output_file):
+    """Check that mangle_ifs turns each input fixture into its expected output fixture."""
+    # make directory
+    os.makedirs(os.path.join("if_mangled_files"), exist_ok=True)
+
+    # make filepaths
+    input_file_path = os.path.join("tests", "test_files", "if_mangler_test_files", input_file)
+    expected_output_file_path = os.path.join("tests", "test_files", "if_mangler_test_files", output_file)
+    actual_output_file_path = os.path.join("if_mangled_files", input_file)
+    try:
+        mangle_ifs(input_file_path, input_file, {"DESTINATION_DIRECTORY": '.', "VERBOSE": False})
+        # Compare the complete line lists: zip() stops at the shorter file, so a
+        # truncated or empty output file would otherwise pass unnoticed.
+        with open(expected_output_file_path, "r") as expected_read_file, open(actual_output_file_path, "r") as actual_read_file:
+            assert actual_read_file.readlines() == expected_read_file.readlines()
+    finally:
+        shutil.rmtree("if_mangled_files", ignore_errors=True)  # clean up even when the test fails
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_insert_links.py b/scripts/HPC_chatbot_preprocessor/tests/test_insert_links.py
new file mode 100644
index 00000000000..9109f2518ad
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_insert_links.py
@@ -0,0 +1,31 @@
+import pytest
+from chatbot_parser import insert_links
+
+options_include = {"INCLUDE_LINKS_IN_PLAINTEXT": True}  # the cases below expect link markers to be replaced by the URL
+options_leave_out = {"INCLUDE_LINKS_IN_PLAINTEXT": False}  # the cases below expect link markers to be removed from the text
+links_input = {"0": "https://first_link.com", "1": "https://second_link.be", "2": "https://docs.hpc.ugent.be/account#welcome-e-mail", "3": "https://final-link.org"}  # link table passed to insert_links in every case
+
+
+@pytest.mark.parametrize("text_input, options_input, text_output, new_links", [
+ # Text without link markers: the text comes back unchanged and no links are reported
+ # links left out of the plaintext
+ ("Text without links\nand with two lines.", options_leave_out, "Text without links\nand with two lines.", {}),
+ # links included in the plaintext
+ ("Text without links\nand with two lines.", options_include, "Text without links\nand with two lines.", {}),
+ # Text with a marker for every entry of links_input: all links are reported back
+ # links left out of the plaintext (markers vanish, the newlines between them stay)
+ ("Text with all the links\nand with multiple lines.\n§link§link§0§link§link§\n§link§link§1§link§link§\n§link§link§2§link§link§\n§link§link§3§link§link§", options_leave_out,
+ "Text with all the links\nand with multiple lines.\n\n\n\n", links_input),
+ # links included in the plaintext (each marker becomes the URL surrounded by spaces)
+ ("Text with all the links\nand with multiple lines.\n§link§link§0§link§link§\n§link§link§1§link§link§\n§link§link§2§link§link§\n§link§link§3§link§link§", options_include,
+ "Text with all the links\nand with multiple lines.\n https://first_link.com \n https://second_link.be \n https://docs.hpc.ugent.be/account#welcome-e-mail \n https://final-link.org ", links_input),
+ # Text with markers for only some links: the reported links are renumbered from "0"
+ # links left out of the plaintext
+ ("Text with all the links\nand with multiple lines.\n§link§link§1§link§link§\n§link§link§3§link§link§", options_leave_out,
+ "Text with all the links\nand with multiple lines.\n\n", {"0": "https://second_link.be", "1": "https://final-link.org"}),
+ # links included in the plaintext
+ ("Text with all the links\nand with multiple lines.\n§link§link§0§link§link§\n§link§link§2§link§link§", options_include,
+ "Text with all the links\nand with multiple lines.\n https://first_link.com \n https://docs.hpc.ugent.be/account#welcome-e-mail ", {"0": "https://first_link.com", "1": "https://docs.hpc.ugent.be/account#welcome-e-mail"})
+])
+def test_insert_links(text_input, options_input, text_output, new_links):  # every case uses the module-level links_input table
+ assert insert_links(text_input, links_input, options_input) == (text_output, new_links)  # result is a (text, links) pair
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_links.py b/scripts/HPC_chatbot_preprocessor/tests/test_links.py
new file mode 100644
index 00000000000..a13675dd3ad
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_links.py
@@ -0,0 +1,71 @@
+import os
+import pytest
+from urllib import request
+from chatbot_parser import main
+import json
+
+#################################################### IMPORTANT: This test still fails because there are some invalid links in the documentation ####################################################
+
+whitelist = ["mailto:hpc@ugent.be"]  # links in this list are never requested by test_all_links
+slow_list = ["https://login.hpc.ugent.be", "https://www.edx.org/course/introduction-linux-linuxfoundationx-lfs101x-0"]  # also never requested; presumably too slow to fetch, confirm
+
+options_general = {"SOURCE_DIRECTORY": "../../mkdocs/docs/HPC",  # first parameter set of test_all_links
+ "DESTINATION_DIRECTORY": ".",
+ "SPLIT_ON_TITLES": False,
+ "SPLIT_ON_PARAGRAPHS": True,
+ "MIN_PARAGRAPH_LENGTH": 683,
+ "MAX_TITLE_DEPTH": 4,
+ "INCLUDE_LINKS_IN_PLAINTEXT": False,
+ "DEEP_DIRECTORIES": False,
+ "VERBOSE": False}
+options_os_specific = {"SOURCE_DIRECTORY": "../../mkdocs/docs/HPC/linux-tutorial",  # second parameter set: only the linux-tutorial subdirectory
+ "DESTINATION_DIRECTORY": "./linux-tutorial",
+ "SPLIT_ON_TITLES": False,
+ "SPLIT_ON_PARAGRAPHS": True,
+ "MIN_PARAGRAPH_LENGTH": 683,
+ "MAX_TITLE_DEPTH": 4,
+ "INCLUDE_LINKS_IN_PLAINTEXT": False,
+ "DEEP_DIRECTORIES": False,
+ "VERBOSE": False}
+
+
+@pytest.mark.parametrize("options", [options_general, options_os_specific])
+def test_all_links(options):
+    """Run main(options) and request every link stored in the generated metadata files."""
+    main(options)
+    all_links = {}
+    broken_links = {}
+    empty_links = {}
+
+    # Collect the links per metadata file. The key is the full path: metadata files
+    # in different directories can share a filename, and keying on the bare name
+    # would silently drop all but the last file's links.
+    for (dirpath, dirnames, filenames) in os.walk(os.path.join(options['DESTINATION_DIRECTORY'], 'parsed_mds')):
+        for filename in filenames:
+            if filename.endswith('metadata.json'):
+                metadata_path = os.path.join(dirpath, filename)
+                with open(metadata_path) as metadata_file:
+                    data = json.load(metadata_file)
+                # a set removes duplicates; the reference link is checked without its anchor
+                links = set(data.get('links', {}).values())
+                links.add(data['reference_link'].split("#")[0])
+                all_links[metadata_path] = links
+
+    # request every collected link once per metadata file
+    for metadata_path, links in all_links.items():
+        for link in sorted(links):
+            if len(link) == 0:
+                print("Empty link in " + metadata_path)
+                empty_links.setdefault(metadata_path, []).append(link)
+            elif link not in whitelist and link not in slow_list:
+                # whitelisted and slow links are deliberately never requested
+                try:
+                    # urlopen raises for unreachable hosts and HTTP error statuses
+                    with request.urlopen(link):
+                        pass
+                except Exception:  # not a bare except, so Ctrl-C still aborts the run
+                    print("Broken link in " + metadata_path + ": " + link)
+                    broken_links.setdefault(metadata_path, []).append(link)
+
+    assert not empty_links, f"Empty links found: {empty_links}"
+    assert not broken_links, f"Broken links found: {broken_links}"
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_lists.py b/scripts/HPC_chatbot_preprocessor/tests/test_lists.py
new file mode 100644
index 00000000000..56ac3348dfa
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_lists.py
@@ -0,0 +1,46 @@
+import pytest
+from chatbot_parser import split_on_paragraphs
+
+
+@pytest.mark.parametrize("file, main_title, options, is_linux_tutorial, expected_text", [
+ ("./tests/test_files/list_file/list_test.md",
+ "list_test.md",
+ {
+ "SOURCE_DIRECTORY": "./test_files/list_file",  # NOTE(review): lacks the "tests/" prefix used in the file argument above; confirm this is intended
+ "DESTINATION_DIRECTORY": "./test_files/list_file",
+ "SPLIT_ON_TITLES": False,
+ "SPLIT_ON_PARAGRAPHS": True,
+ "MIN_PARAGRAPH_LENGTH": 100,
+ "MAX_TITLE_DEPTH": 4,
+ "INCLUDE_LINKS_IN_PLAINTEXT": False,
+ "DEEP_DIRECTORIES": False,
+ "VERBOSE": False
+ },
+ False,
+ {'list_test.md_paragraph_001': 'Title\n'
+ 'Some explanation about the following list that '
+ 'is quite long. This could be problematic since '
+ 'this could mean that the explanation of the '
+ 'content of the list would be part of a '
+ 'different paragraph than the list.\n'
+ '1. First entry that is very verbose since we '
+ 'want to hit the character limit for a '
+ "paragraph to make sure a list can't be split "
+ 'in the middle. If this entry is long enough, '
+ 'the character limit should make it so that any '
+ 'of the following newlines can be the start of '
+ "a new section if the splitter doesn't know it "
+ 'is in a list.\n'
+ '2. Second entry\n'
+ '3. Third entry\n'  # the image under the third entry of the fixture is not expected in the text
+ '4. Fourth entry that is very verbose, so we '
+ 'hit the character limit for a section split, '
+ "even though it shouldn't be necessary since "
+ 'the explanation of the list is already well '
+ 'above the character limit.\n',
+ 'list_test.md_paragraph_002': 'And now the text continues like normal in a '
+ 'new section.'}
+ )
+])
+def test_links(file, main_title, options, is_linux_tutorial, expected_text):  # NOTE(review): named test_links, but it checks that the whole list stays in one paragraph
+ assert split_on_paragraphs(file, main_title, options, is_linux_tutorial)[1] == expected_text  # only element [1] of the result is compared
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_make_valid_title.py b/scripts/HPC_chatbot_preprocessor/tests/test_make_valid_title.py
new file mode 100644
index 00000000000..225c368477d
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_make_valid_title.py
@@ -0,0 +1,14 @@
+import pytest
+from chatbot_parser import make_valid_title
+
+
+@pytest.mark.parametrize("input_string,expected", [
+ ("", ""),  # empty input stays empty
+ ("A-good-filename-with-dashes", "A-good-filename-with-dashes"),  # an already valid title is returned unchanged
+ (" A very good filename beginning and ending in a space ", "A-very-good-filename-beginning-and-ending-in-a-space"),  # outer spaces dropped, inner spaces become dashes
+ ("-A-very-good-filename-beginning-and-ending-in-a-dash-", "A-very-good-filename-beginning-and-ending-in-a-dash"),  # leading and trailing dashes dropped
+ ("A filename containing bad characters <>:\"/\\|?*\0", "A-filename-containing-bad-characters"),  # none of these characters may survive
+ ("A filename ending with {some jinja garbage}", "A-filename-ending-with")  # the trailing {...} part is dropped
+])
+def test_make_valid_title(input_string, expected):  # one exact-match check per case above
+ assert make_valid_title(input_string) == expected
diff --git a/scripts/HPC_chatbot_preprocessor/tests/test_replace_markdown_markers.py b/scripts/HPC_chatbot_preprocessor/tests/test_replace_markdown_markers.py
new file mode 100644
index 00000000000..d9150290f34
--- /dev/null
+++ b/scripts/HPC_chatbot_preprocessor/tests/test_replace_markdown_markers.py
@@ -0,0 +1,46 @@
+import pytest
+from chatbot_parser import replace_markdown_markers
+
+
+@pytest.mark.parametrize("input_line, input_linklist, in_code_block, main_title, expected_line, expected_linklist", [
+ # baseline test
+ ("A normal line with nothing special", [], False, "", "A normal line with nothing special", []),
+ # image 1
+ ("![image](a-nice-image.png)", [], False, "", "", []),
+ # image 2
+ ("![](img/Look-at-this-photograph.png)", [], False, "", "", []),
+ # link 1 (outside docs)
+ ("A line with a [link](https://a-nice-link.com)", ["https://another-link.be"], False, "",
+ "A line with a link§link§link§1§link§link§", ["https://another-link.be", "https://a-nice-link.com"]),
+ # link 2 (another document within the docs)
+ ("A line with a [link to the docs](account.md#welcome-e-mail)", ["https://another-link.be"], False, "",
+ "A line with a link to the docs§link§link§1§link§link§", ["https://another-link.be", "https://docs.hpc.ugent.be/account/#welcome-e-mail"]),
+ # link 3 (the same document)
+ ("A line with a [link to the same doc](#welcome-e-mail)", ["https://another-link.be"], False, "account.md",
+ "A line with a link to the same doc§link§link§1§link§link§", ["https://another-link.be", "https://docs.hpc.ugent.be/account/#welcome-e-mail"]),
+ # codeblock
+ ("```shell", [], True, "", "", []),
+ # html syntax 1 (normal syntax)
+ ("A line with something in Bold", [], False, "", "A line with something in Bold", []),
+ # html syntax 2 (link)
+ ("A line with another link", ["other-website.com"], False, "",
+ "A line with another link§link§link§1§link§link§", ["other-website.com", "website.com"]),
+ # html syntax 3 (style)
+ ("A line with style
", [], False, "", "A line with style", []), + # Bot comment + ("", [], False, "", "Something about the following table", []), + # non-Bot comment + ("", [], False, "", "", []), + # something else with <> + ("A line with an example where you should put