Chatbot parser output linux tutorial #679

Draft. Wants to merge 153 commits into base: main.

Commits (153)
1ebc363
initial commit
EwDa291 Aug 8, 2024
34df842
Merge branch 'hpcugent:main' into chatbot_parser
EwDa291 Aug 8, 2024
10edb20
some cleanup
EwDa291 Aug 8, 2024
85a93ec
used jinja to replace macros
EwDa291 Aug 9, 2024
dfff5fa
adapt if-mangler to accommodate for nested if-clauses
EwDa291 Aug 9, 2024
649ddec
adapt the parser to take all files as input, not all files get parsed…
EwDa291 Aug 9, 2024
2116d6e
adapt the parser to take all files as input, not all files get parsed…
EwDa291 Aug 9, 2024
159aa62
small update, not important
EwDa291 Aug 9, 2024
75765e5
change to the templates
EwDa291 Aug 9, 2024
57d9cfe
change to accommodate for more nested if-clauses
EwDa291 Aug 9, 2024
75d345b
Delete scripts/HPC chatbot preprocessor/start_checker.py
EwDa291 Aug 9, 2024
ff7a9fc
make sure files with duplicate names between normal files and linux-t…
EwDa291 Aug 12, 2024
47a33b7
Merge branch 'chatbot_parser' of https://github.com/EwDa291/vsc_user_…
EwDa291 Aug 12, 2024
7d279d6
fixed the problem of some files being written in reST instead of mark…
EwDa291 Aug 12, 2024
8047572
some small fixes
EwDa291 Aug 12, 2024
7d1c5ed
remove try-except-structure
EwDa291 Aug 13, 2024
984b0cd
collapse all code into one file
EwDa291 Aug 13, 2024
8f5eeaa
Rename file
EwDa291 Aug 13, 2024
2b97b7a
cleanup repository
EwDa291 Aug 13, 2024
b595301
Rename directory
EwDa291 Aug 13, 2024
90c8ab7
add a main function
EwDa291 Aug 13, 2024
b8ae706
make file paths non os-specific
EwDa291 Aug 13, 2024
b751497
use docstrings to document the functions
EwDa291 Aug 13, 2024
0f8eb5d
rewrite the if-mangler to make it more readable
EwDa291 Aug 13, 2024
9938e92
got rid of most global variables
EwDa291 Aug 13, 2024
508b22c
fixed some issues with if statements
EwDa291 Aug 13, 2024
a25ce2d
fixed some issues with if statements
EwDa291 Aug 13, 2024
80d0535
got rid of all global variables
EwDa291 Aug 13, 2024
9163a75
small changes to make file more readable
EwDa291 Aug 14, 2024
1dcffc1
codeblocks, tips, warnings and info reformatted
EwDa291 Aug 14, 2024
4d7fbdb
small optimisations
EwDa291 Aug 14, 2024
671f7f3
small optimisations
EwDa291 Aug 14, 2024
e5c39bd
initial commit
EwDa291 Aug 14, 2024
c6492fc
added requirements
EwDa291 Aug 14, 2024
aff8198
added requirements and usage info
EwDa291 Aug 14, 2024
a981002
minor changes to the print statements
EwDa291 Aug 14, 2024
1f3b343
reworked function to take care of html structures
EwDa291 Aug 16, 2024
b6388d3
Merge branch 'hpcugent:main' into chatbot_parser
EwDa291 Aug 16, 2024
48cad97
filter out images
EwDa291 Aug 16, 2024
df58f23
get rid of backquotes, asterisks, pluses and underscores used for for…
EwDa291 Aug 16, 2024
c423e07
dump to json files instead of txt files
EwDa291 Aug 16, 2024
2c333fe
cleaned up parser with macros
EwDa291 Aug 16, 2024
ce52352
cleaned up parser with macros
EwDa291 Aug 16, 2024
5db34af
cleaned up parser with macros
EwDa291 Aug 16, 2024
4226d28
Update README.md
EwDa291 Aug 19, 2024
d730a26
Update README.md
EwDa291 Aug 19, 2024
f3182e3
added section about restrictions on input files
EwDa291 Aug 19, 2024
aee54de
Merge branch 'hpcugent:main' into chatbot_parser
EwDa291 Aug 19, 2024
675bec5
adapted section about restrictions on input files
EwDa291 Aug 19, 2024
f1e58ef
adapted section about restrictions on input files
EwDa291 Aug 19, 2024
2bf1075
Merge branch 'chatbot_parser' of https://github.com/EwDa291/vsc_user_…
EwDa291 Aug 19, 2024
a168509
change variables to be lowercase
EwDa291 Aug 19, 2024
09b86c9
take out some copy pasting
EwDa291 Aug 19, 2024
f95b99e
added warning about long filepaths
EwDa291 Aug 19, 2024
06bb7b9
fixing typos
EwDa291 Aug 19, 2024
2f3e5b3
take out copy pasting
EwDa291 Aug 19, 2024
0c4dbe8
first draft version of the restructured script to accommodate for the…
EwDa291 Aug 20, 2024
38c4572
added support to filter out collapsable admonitions
EwDa291 Aug 20, 2024
5cbd653
attempt at fix for problems with jinja include, not working yet
EwDa291 Aug 20, 2024
0e6f8b2
fixed an issue with jinja templates
EwDa291 Aug 21, 2024
cd77837
added docstrings to new functions
EwDa291 Aug 21, 2024
98eb695
only add necessary if-statements in front of non-if-complete sections
EwDa291 Aug 21, 2024
27457e3
fixed some more jinja problems
EwDa291 Aug 21, 2024
bb72287
implemented extra test to make sure generic files dont accidentally g…
EwDa291 Aug 21, 2024
67cb19e
make sure empty os-specific files are not saved
EwDa291 Aug 21, 2024
cf9834a
clean up unused code
EwDa291 Aug 21, 2024
da32459
introduce more macros
EwDa291 Aug 21, 2024
093200b
reintroduce logic to remove unnecessary directories
EwDa291 Aug 21, 2024
5d0ffe9
added functionality to include links or leave them out
EwDa291 Aug 21, 2024
a3e34a9
added functionality to include links or leave them out
EwDa291 Aug 21, 2024
7c6154b
adapt filenames to allow for splitting on something other than subtitles
EwDa291 Aug 21, 2024
8d5b50d
making some changes to prepare to add paragraph level splitting tomorrow
EwDa291 Aug 21, 2024
0c10376
making some changes to prepare to add paragraph level splitting tomorrow
EwDa291 Aug 21, 2024
f8ee860
making some changes to prepare to add paragraph level splitting tomorrow
EwDa291 Aug 21, 2024
6533733
adapted the parsing script to allow for testing in a semi-efficient way
EwDa291 Aug 21, 2024
2e7a00f
added test for make_valid_title
EwDa291 Aug 21, 2024
f5e0579
removed useless lines from testscript
EwDa291 Aug 21, 2024
6757b4f
First attempt at splitting in paragraphs (need for other fixes for ti…
EwDa291 Aug 22, 2024
6d9558d
make two functions for different ways of dividing the text
EwDa291 Aug 22, 2024
2c7025a
added docstrings to new functions
EwDa291 Aug 22, 2024
ae99bb9
update test for valid titles
EwDa291 Aug 22, 2024
084b421
fixed problem with splitting os-specific text (metadata not fixed yet)
EwDa291 Aug 22, 2024
cf7f5f0
fix for metadata of os-specific sections
EwDa291 Aug 22, 2024
b7c10d3
clean up temporary version
EwDa291 Aug 22, 2024
4a441f3
added command line options for custom macros
EwDa291 Aug 22, 2024
662134f
small fix to macros
EwDa291 Aug 22, 2024
05eab4a
clean up test for valid title
EwDa291 Aug 22, 2024
b85a8fb
add a test for write_metadata
EwDa291 Aug 22, 2024
39a3c99
added functionality to split on paragraphs
EwDa291 Aug 23, 2024
af9e6cc
clean up
EwDa291 Aug 23, 2024
f4163a7
clean up
EwDa291 Aug 23, 2024
833f964
further clean up and added shebang
EwDa291 Aug 23, 2024
79b1a56
clean up
EwDa291 Aug 23, 2024
cec154c
added test for if mangler
EwDa291 Aug 23, 2024
2f4a277
clean up
EwDa291 Aug 23, 2024
cd0c8eb
clean up customizable options
EwDa291 Aug 23, 2024
3be262a
further adapt the script to be able to test it
EwDa291 Aug 26, 2024
1d32aab
make changes to usage in command line to be more intuitive
EwDa291 Aug 26, 2024
5902c96
first revised version of the README
EwDa291 Aug 26, 2024
6f97d5f
Merge branch 'hpcugent:main' into chatbot_parser
EwDa291 Aug 26, 2024
6e48800
added docstring to main function
EwDa291 Aug 26, 2024
0bc440b
include chatbot_preprocessor
EwDa291 Aug 26, 2024
e6e6023
added options for source and destination directories
EwDa291 Aug 26, 2024
a6d99d9
cleanup
EwDa291 Aug 26, 2024
2be834f
cleanup
EwDa291 Aug 26, 2024
532543a
cleanup
EwDa291 Aug 26, 2024
107464e
relocate test files
EwDa291 Aug 26, 2024
dd64381
update arguments of if mangler
EwDa291 Aug 26, 2024
ef3fd58
relocate full test files
EwDa291 Aug 26, 2024
4d7db8f
Revert "update arguments of if mangler"
EwDa291 Aug 26, 2024
df9bac5
Revert "relocate full test files"
EwDa291 Aug 26, 2024
631d9e9
update test to adapt to new arguments in if mangler
EwDa291 Aug 26, 2024
c6e600d
relocated full test files
EwDa291 Aug 26, 2024
d1c6194
Rename test_paragraph_split_1.md to test_paragraph_split_1_input.md
EwDa291 Aug 26, 2024
695ffd6
Rename test_title_split_1.md to test_title_split_1_input.md
EwDa291 Aug 26, 2024
af4832b
small fix
EwDa291 Aug 26, 2024
8805c8c
test text for paragraph split
EwDa291 Aug 26, 2024
a265ffd
start of a fix for double title problem, not done yet
EwDa291 Aug 26, 2024
6c2a61c
Fix for double title bug when splitting on paragraph
EwDa291 Aug 27, 2024
ed08879
Fix bug for empty linklist in metadata
EwDa291 Aug 27, 2024
176af13
fix bug where too many directories were sometimes created
EwDa291 Aug 27, 2024
d4ceac8
test of full script, test files not ready to be pushed yet
EwDa291 Aug 27, 2024
815a863
updated requirements.txt
EwDa291 Aug 27, 2024
d15469f
updated docstring in main function
EwDa291 Aug 27, 2024
daa6b36
add support for comments for the bot to be included in the source files
EwDa291 Aug 27, 2024
4c19f44
changed the default for min paragraph length
EwDa291 Aug 27, 2024
9a6ff58
added test files for full script test
EwDa291 Aug 27, 2024
56543f0
small fix for double title bug
EwDa291 Aug 27, 2024
52a3861
added examples of output of the script when splitting on paragraphs w…
EwDa291 Aug 27, 2024
692e77b
fix for issue with html links
EwDa291 Aug 27, 2024
7f493a1
fix for issue with html links
EwDa291 Aug 27, 2024
0e34396
fix for issue with relative links to the same document
EwDa291 Aug 27, 2024
fa00044
added test for replace_markdown_markers
EwDa291 Aug 27, 2024
b3952b2
fix to small inconsistency in metadata
EwDa291 Aug 27, 2024
73072bf
added test for insert_links
EwDa291 Aug 27, 2024
3161309
make sure paragraphs only include full lists
EwDa291 Aug 28, 2024
7d4d7f9
Merge branch 'hpcugent:main' into chatbot_parser
EwDa291 Aug 28, 2024
3407be3
adapted to the new source files
EwDa291 Aug 28, 2024
6d04bbc
add source-directory to metadata and verbose mode
EwDa291 Aug 28, 2024
f33cfb3
added verbose mode
EwDa291 Aug 28, 2024
1c389d7
Merge branch 'hpcugent:main' into chatbot_parser
EwDa291 Aug 28, 2024
3227f19
Added limitation on lists
EwDa291 Aug 29, 2024
67aed53
fix for non os-specific if-statement not being recognised
EwDa291 Aug 29, 2024
9e297b1
new test for links
EwDa291 Aug 29, 2024
b6b8610
new test to make sure lists are kept as one section
EwDa291 Aug 29, 2024
57a2139
updated test_file for list test
EwDa291 Aug 29, 2024
170a10c
dropped <> around links and started new function to calculate length …
EwDa291 Aug 30, 2024
04efff6
removed parsed mds
EwDa291 Aug 30, 2024
1ef1f10
Changed paragraphs to decide length based on tokens instead of charac…
EwDa291 Aug 30, 2024
621c0a3
Changed paragraphs to decide length based on tokens instead of charac…
EwDa291 Aug 30, 2024
adf364d
Changed paragraphs to decide length based on tokens instead of charac…
EwDa291 Aug 30, 2024
e36482e
Added output of chatbot parser script (linux tutorial)
EwDa291 Aug 30, 2024
8f54b86
removing unnecessary files
EwDa291 Aug 30, 2024
Changes from 1 commit: new test for links
EwDa291 committed Aug 29, 2024
commit 9e297b18ef9827a20a1283053ad49c3e081044e7
6 changes: 5 additions & 1 deletion scripts/HPC_chatbot_preprocessor/README.md
@@ -172,7 +172,7 @@ Any comments within the markdown files (for example TODO's) should follow the fo
Comments can be written in such a way that the script will keep them as input for the bot. To do that, the marker `INPUT_FOR_BOT` should be put in front of the content of the comment, as follows:

```
<!--INPUT_FOR_BOTyour comment for the bot-->
<!--INPUT_FOR_BOT: your comment for the bot-->
```

This will be reworked to
@@ -190,3 +190,7 @@ Due to the nature of this script, it can generate large directories with very lo
### Markdown lists

The parser is made in a way to detect lists and not split them across multiple paragraphs. The kinds of lists it can detect are lists with the markers `-`, `+` and `*`, and lists indexed with numbers or letters (one letter per list entry). It can handle list entries that are spread out over multiple lines if there is an indentation of at least two spaces. It can also handle multi-paragraph list entries in this way, as long as the indentation is kept.
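
As an illustration of these rules, here is a hypothetical sketch of how a line could be classified as a new list entry or as a continuation of one; this is not the parser's own detection code:

```
import re

# Hypothetical illustration of the list rules described above, not the parser's code.
LIST_ITEM = re.compile(r'^\s*(?:[-+*]|\d+[.)]|[A-Za-z][.)])\s+')  # -, +, *, numbers, single letters

def is_list_item(line):
    """Return True for lines that start a new list entry."""
    return bool(LIST_ITEM.match(line))

def is_list_continuation(line):
    """Return True for follow-up lines of an entry: indented by at least two spaces."""
    return line.startswith("  ") and line.strip() != ""

print(is_list_item("- first entry"))                             # True
print(is_list_item("a) lettered entry"))                         # True
print(is_list_continuation("  second line of the same entry"))   # True
```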

### Links

Links are part of the metadata the parser generates. For these links to be built correctly, links to external sites should always start with either `https://` or `http://`.
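
The effect of this convention can be sketched with a simplified, hypothetical `normalize_link` helper, using the `DOCS_URL` base that appears in the generated metadata; the parser's actual logic is in `make_valid_link` in `chatbot_parser.py` below:

```
DOCS_URL = "https://docs.hpc.ugent.be"  # base URL used in the generated metadata

def normalize_link(link, main_title):
    """Simplified sketch: keep external links as-is, turn relative links into absolute docs URLs."""
    if link.startswith(("http://", "https://", "mailto:")):
        return link
    if link.startswith("./"):
        link = link.replace("./", "", 1)
    elif link.startswith("../"):
        link = link.replace("../", "")  # drop all leading "../" segments
    if link.startswith("#"):
        # anchor within the same page
        return DOCS_URL + "/" + main_title + "/" + link
    return DOCS_URL + "/" + link.replace(".md", "")

print(normalize_link("../sites/hpc_policies", "account"))
# https://docs.hpc.ugent.be/sites/hpc_policies
print(normalize_link("https://docs.hpc.ugent.be/web_portal", "connecting"))
# https://docs.hpc.ugent.be/web_portal
```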
89 changes: 65 additions & 24 deletions scripts/HPC_chatbot_preprocessor/chatbot_parser.py
@@ -104,7 +104,7 @@
METADATA_EXTENSION = "_metadata"

# Marker for comments for the bot
INPUT_FOR_BOT = "INPUT_FOR_BOT"
INPUT_FOR_BOT = "INPUT_FOR_BOT: "

# Standard strings for verbose output
LINE = "------------------------------------------------------------------------------------------------------\n"
@@ -138,34 +138,69 @@ def check_for_title(line, in_code_block, curr_dirs, options):
return 0


def replace_markdown_markers(curr_line, linklist, in_code_block, main_title):
def make_valid_link(link, main_title, is_linux_tutorial):
"""
Function that converts a string to a valid link to be used in the metadata

:param link: the input string to be turned into a valid link
:param main_title: the main title of the file that contains the link
:param is_linux_tutorial: boolean indicating whether the current file is part of the linux tutorial
:return link: the valid link
"""

# workaround: links between linux tutorial pages need the linux-tutorial directory prefixed in the URL
linux_tutorial_files = ["beyond_the_basics", "common_pitfalls", "getting_started", "hpc_infrastructure", "index", "manipulating_files_and_directories", "navigating", "uploading_files"]
if is_linux_tutorial and any(name in link for name in linux_tutorial_files):
linux_part = LINUX_TUTORIAL + '/'
else:
linux_part = ""

if link.startswith('http://') or link.startswith('https://') or link.startswith('mailto:'):
pass
else:
if link.startswith("./"):
link = link.replace('./', '')
elif link.startswith("../"):
link = link.replace('../', '')

if link.startswith("#"):
link = DOCS_URL + '/' + linux_part + main_title + "/" + link
elif link.endswith(".md") and ("/" not in link or "." not in link.split("/")[0]):
link = DOCS_URL + '/' + linux_part + link.replace(".md", "")
elif '.md#' in link:
link = DOCS_URL + '/' + linux_part + link.replace(".md", "/")
else:
link = DOCS_URL + '/' + linux_part + link

link = link.replace('index/', '').replace('/index', '')

return link


def replace_markdown_markers(curr_line, linklist, in_code_block, main_title, is_linux_tutorial):
"""
function that replaces certain markdown structures with the equivalent used on the website

:param curr_line: the current line on which markdown structures need to be replaced
:param linklist: the list used to store links that need to be printed at the end of the file
:param in_code_block: boolean indicating whether the current line is part of a code block
:param main_title: the main title of the file that is being processed
:param is_linux_tutorial: boolean indicating whether the current file is part of the linux tutorial
:return curr_line: the adapted current line
:return linklist: the updated linklist
"""

# replace images with an empty line
if re.search(r'(?i)!\[image]\(.*?\)', curr_line) or re.search(r'!\[]\(img/.*?.png\)', curr_line):
if re.search(r'(?i)!\[image]\(.*?\)', curr_line) or re.search(r'!\[.*?]\(img/.*?\.png\)', curr_line):
curr_line = ""

# replace links with a reference
matches = re.findall(r'\[(.*?)]\((.*?)\)', curr_line)
if matches:
for match in matches:
curr_line = curr_line.replace(f"[{match[0]}]({match[1]})", match[0] + LINK_MARKER + str(len(linklist)) + LINK_MARKER)
if ".md" not in match[1]:
if "#" not in match[1]:
linklist.append(match[1])
else:
linklist.append(DOCS_URL + "/" + main_title.replace(".md", "") + "/" + match[1])
else:
linklist.append(DOCS_URL + "/" + match[1].replace(".md", "/").replace("index", "").rstrip("/"))

linklist.append(make_valid_link(match[1], main_title, is_linux_tutorial))

# codeblock (with ``` -> always stands on a separate line, so line can be dropped)
if '```' in curr_line:
@@ -238,13 +273,14 @@ def replace_markdown_markers(curr_line, linklist, in_code_block, main_title):
return curr_line, linklist


def split_text(file, main_title, options, current_paragraph_number=-1, OS=GENERIC):
def split_text(file, main_title, options, is_linux_tutorial, current_paragraph_number=-1, OS=GENERIC):
"""
Function that splits the text into smaller sections and makes them into two dictionaries containing text and metadata

:param file: the filepath of the file to be split
:param main_title: the main title of the file
:param options: dictionary containing the options given by the user
:param is_linux_tutorial: boolean indicating whether the current file is part of the linux tutorial
:param current_paragraph_number: number of the paragraph that is being split, only applicable when splitting an os-specific paragraph on paragraph level
:param OS: the OS of the file to be split, only applicable when splitting an os-specific paragraph on paragraph level
:return paragraphs_text: dictionary containing the split sections of text
@@ -253,18 +289,19 @@ def split_text(file, main_title, options, current_paragraph_number=-1, OS=GENERI
"""

if options[SPLIT_ON_TITLES]:
return split_on_titles(file, main_title, options)
return split_on_titles(file, main_title, options, is_linux_tutorial)
elif options[SPLIT_ON_PARAGRAPHS]:
return split_on_paragraphs(file, main_title, options, current_paragraph_number, OS)
return split_on_paragraphs(file, main_title, options, is_linux_tutorial, current_paragraph_number, OS)


def split_on_titles(file, main_title, options):
def split_on_titles(file, main_title, options, is_linux_tutorial):
"""
Function that splits the text into smaller sections based on the subtitle structure and makes them into two dictionaries containing text and metadata

:param file: the filepath of the file to be split
:param main_title: the main title of the file
:param options: dictionary containing the options given by the user
:param is_linux_tutorial: boolean indicating whether the current file is part of the linux tutorial
:return paragraphs_text: dictionary containing the split sections of text
:return paragraphs_metadata: dictionary containing the metadata of each split section of text
:return subtitle_order: list containing all encountered subtitles in order of appearance
@@ -356,7 +393,7 @@ def split_on_titles(file, main_title, options):

# line is not a title
elif after_first_title:
line, link_list = replace_markdown_markers(line, link_list, in_code_block, main_title)
line, link_list = replace_markdown_markers(line, link_list, in_code_block, main_title, is_linux_tutorial)
if line != "\n":
current_paragraph += line

@@ -366,7 +403,7 @@ def split_on_titles(file, main_title, options):
last_dir = curr_dirs[last_title_level]
else:
previous_contained_if = True
line, link_list = replace_markdown_markers(line, link_list, in_code_block, main_title)
line, link_list = replace_markdown_markers(line, link_list, in_code_block, main_title, is_linux_tutorial)
if line != "\n":
current_paragraph += line

@@ -384,13 +421,14 @@ def split_on_titles(file, main_title, options):
return paragraphs_os_text, paragraphs_os_free_text, paragraphs_metadata, subtitle_order


def split_on_paragraphs(file, main_title, options, current_paragraph_number=-1, OS=GENERIC):
def split_on_paragraphs(file, main_title, options, is_linux_tutorial, current_paragraph_number=-1, OS=GENERIC):
"""
Function that splits the text into smaller sections based on the paragraph structure and makes them into two dictionaries containing text and metadata

:param file: the filepath of the file to be split
:param main_title: the main title of the file
:param options: dictionary containing the options given by the user
:param is_linux_tutorial: boolean indicating whether the current file is part of the linux tutorial
:param current_paragraph_number: number of the paragraph that is being split, only applicable when splitting an os-specific paragraph
:param OS: the OS of the file to be split, only applicable when splitting an os-specific paragraph
:return paragraphs_text: dictionary containing the split sections of text
@@ -524,12 +562,12 @@ def split_on_paragraphs(file, main_title, options, current_paragraph_number=-1,
# make a new title
metadata_title = make_valid_title(line[title_level + 1:-1])

line, link_list = replace_markdown_markers(line[title_level + 1:], link_list, in_code_block, main_title)
line, link_list = replace_markdown_markers(line[title_level + 1:], link_list, in_code_block, main_title, is_linux_tutorial)
current_paragraph += line

# line is not a title or the beginning of a new paragraph
elif line != "\n" or previous_contained_if:
line, link_list = replace_markdown_markers(line, link_list, in_code_block, main_title)
line, link_list = replace_markdown_markers(line, link_list, in_code_block, main_title, is_linux_tutorial)
current_paragraph += line

# keep track of title level and directory to write to metadata upon discovering a new subtitle
@@ -538,7 +576,7 @@ def split_on_paragraphs(file, main_title, options, current_paragraph_number=-1,
last_dir = curr_dirs[last_title_level]
else:
previous_contained_if = True
line, link_list = replace_markdown_markers(line, link_list, in_code_block, main_title)
line, link_list = replace_markdown_markers(line, link_list, in_code_block, main_title, is_linux_tutorial)
current_paragraph += line

# create a title for the last paragraph
@@ -799,7 +837,7 @@ def make_valid_title(title):
valid_filename = re.sub(invalid_chars, '', title)

# Strip leading/trailing whitespace
valid_filename = valid_filename.strip().strip('-').replace(' ', '-')
valid_filename = valid_filename.strip().strip('-').replace(' ', '-').replace("--", "-")

return valid_filename

@@ -889,7 +927,10 @@ def write_files(title, text, paragraphs_metadata, title_order, title_order_numbe
os_part = ""
else:
os_part = LINK_OS[OS] + "/"
metadata[REFERENCE_LINK] = DOCS_URL + "/" + os_part + linux_part + paragraphs_metadata[title][MAIN_TITLE] + "/#" + ''.join(char.lower() for char in paragraphs_metadata[title][SUBTITLE] if char.isalnum() or char == '-').strip('-')
if "index" not in paragraphs_metadata[title][MAIN_TITLE]:
metadata[REFERENCE_LINK] = DOCS_URL + "/" + os_part + linux_part + paragraphs_metadata[title][MAIN_TITLE] + "/#" + ''.join(char.lower() for char in paragraphs_metadata[title][SUBTITLE] if char.isalnum() or char == '-').strip('-')
else:
metadata[REFERENCE_LINK] = DOCS_URL

# write metadata to file
with open(os.path.join(filepath, file_title + METADATA_EXTENSION + ".json"), 'w') as writefile:
@@ -964,7 +1005,7 @@ def split_and_write_os_specific_section(text, metadata, subtitle_order, title_or
writefile.write(jinja_text)

# split in right way
_, os_specific_text, os_specific_metadata, os_subtitle_order = split_text(TEMP_JINJA_FILE, metadata[MAIN_TITLE], options, current_paragraph_number=subtitle_order[title_order_number].split('_')[-1], OS=OS)
_, os_specific_text, os_specific_metadata, os_subtitle_order = split_text(TEMP_JINJA_FILE, metadata[MAIN_TITLE], options, is_linux_tutorial, current_paragraph_number=subtitle_order[title_order_number].split('_')[-1], OS=OS)

# prepare variables to fix metadata
total_subtitle_order = subtitle_order[:title_order_number] + os_subtitle_order + subtitle_order[title_order_number+1:]
@@ -1110,7 +1151,7 @@ def main(options):
print("\nSplitting the file for the first time (split in sufficiently small generic sections and large os-specific chunks)")

# split the text in paragraphs
paragraphs_os_text, paragraphs_os_free_text, paragraphs_metadata, subtitle_order = split_text(copy_file, main_title, options)
paragraphs_os_text, paragraphs_os_free_text, paragraphs_metadata, subtitle_order = split_text(copy_file, main_title, options, is_linux_tutorial)

if options[VERBOSE]:
print("\nFurther splitting os-specific chunks and writing generic and os-specific sections to files with metadata")
(changed metadata JSON file; filename not shown)
@@ -5,7 +5,7 @@
"title_depth": 2,
"directory": "account",
"links": {
"0": "../sites/hpc_policies"
"0": "https://docs.hpc.ugent.be/sites/hpc_policies"
},
"parent_title": "",
"previous_title": null,
(changed metadata JSON file; filename not shown)
@@ -5,7 +5,7 @@
"title_depth": 2,
"directory": "connecting",
"links": {
"0": "https://docs.hpc.ugent.be/connecting/../linux-tutorial/uploading_files/#copying-faster-with-rsync"
"0": "https://docs.hpc.ugent.be/linux-tutorial/uploading_files/#copying-faster-with-rsync"
},
"parent_title": "",
"previous_title": "connecting_paragraph_14",
(changed metadata JSON file; filename not shown)
@@ -6,7 +6,7 @@
"directory": "connecting",
"links": {
"0": "https://docs.hpc.ugent.be/web_portal",
"1": "https://docs.hpc.ugent.be/connecting/../troubleshooting/#issues-connecting-to-login-node"
"1": "https://docs.hpc.ugent.be/troubleshooting/#issues-connecting-to-login-node"
},
"parent_title": "",
"previous_title": "connecting_paragraph_2",
(changed metadata JSON file; filename not shown)
@@ -5,7 +5,7 @@
"title_depth": 2,
"directory": "connecting",
"links": {
"0": "../useful_linux_commands"
"0": "https://docs.hpc.ugent.be/useful_linux_commands"
},
"parent_title": "",
"previous_title": "connecting_paragraph_7",
(changed metadata JSON file; filename not shown)
@@ -6,7 +6,7 @@
"directory": "account",
"parent_title": "Getting-ready-to-request-an-account",
"links": {
"0": "../../linux-tutorial"
"0": "https://docs.hpc.ugent.be/linux-tutorial"
},
"previous_title": "account_paragraph_3",
"next_title": "account_paragraph_5",
(changed metadata JSON file; filename not shown)
@@ -6,7 +6,7 @@
"directory": "account",
"parent_title": "Getting-ready-to-request-an-account",
"links": {
"0": "../connecting"
"0": "https://docs.hpc.ugent.be/connecting"
},
"previous_title": "account_paragraph_6",
"next_title": "account_linux_paragraph_7.2",
(changed metadata JSON file; filename not shown)
@@ -6,7 +6,7 @@
"directory": "connecting",
"parent_title": "Transfer-Files-tofrom-the-HPC",
"links": {
"0": "https://docs.hpc.ugent.be/connecting/localhost:8000/Gent//intro-Linux/uploading_files/#symlinks-for-datascratch"
"0": "https://docs.hpc.ugent.be/localhost:8000/Gent//intro-Linux/uploading_files/#symlinks-for-datascratch"
},
"previous_title": "connecting_paragraph_12",
"next_title": "connecting_linux_paragraph_13.2",
(changed metadata JSON file; filename not shown)
@@ -6,8 +6,8 @@
"directory": "connecting",
"parent_title": "Transfer-Files-tofrom-the-HPC",
"links": {
"0": "",
"1": ""
"0": "https://docs.hpc.ugent.be/",
"1": "https://docs.hpc.ugent.be/"
},
"previous_title": "connecting_linux_paragraph_13.5",
"next_title": "connecting_linux_paragraph_13.7",
(changed metadata JSON file; filename not shown)
@@ -6,7 +6,7 @@
"directory": "connecting",
"parent_title": "First-Time-connection-to-the-HPC-infrastructure",
"links": {
"0": "https://docs.hpc.ugent.be/connecting/../troubleshooting/#warning-message-when-first-connecting-to-new-host"
"0": "https://docs.hpc.ugent.be/troubleshooting/#warning-message-when-first-connecting-to-new-host"
},
"previous_title": "connecting_paragraph_4",
"next_title": "connecting_linux_paragraph_5.2",
(changed metadata JSON file; filename not shown)
@@ -6,7 +6,7 @@
"directory": "account",
"parent_title": "Getting-ready-to-request-an-account",
"links": {
"0": "../../linux-tutorial"
"0": "https://docs.hpc.ugent.be/linux-tutorial"
},
"previous_title": "account_paragraph_3",
"next_title": "account_paragraph_5",
(changed metadata JSON file; filename not shown)
@@ -6,7 +6,7 @@
"directory": "account",
"parent_title": "Getting-ready-to-request-an-account",
"links": {
"0": "../connecting"
"0": "https://docs.hpc.ugent.be/connecting"
},
"previous_title": "account_paragraph_6",
"next_title": "account_macos_paragraph_7.2",
(changed metadata JSON file; filename not shown)
@@ -6,7 +6,7 @@
"directory": "account",
"parent_title": "Getting-ready-to-request-an-account",
"links": {
"0": "../../linux-tutorial"
"0": "https://docs.hpc.ugent.be/linux-tutorial"
},
"previous_title": "account_paragraph_3",
"next_title": "account_windows_paragraph_4.2",
(changed metadata JSON file; filename not shown)
@@ -6,8 +6,8 @@
"directory": "account",
"parent_title": "Getting-ready-to-request-an-account",
"links": {
"0": "https://docs.hpc.ugent.be/account/../connecting/#open-a-terminal",
"1": "https://docs.hpc.ugent.be/account/../account/#generating-a-publicprivate-key-pair"
"0": "https://docs.hpc.ugent.be/connecting/#open-a-terminal",
"1": "https://docs.hpc.ugent.be/account/#generating-a-publicprivate-key-pair"
},
"previous_title": "account_windows_paragraph_4.1",
"next_title": "account_windows_paragraph_4.3",
(changed metadata JSON file; filename not shown)
@@ -6,7 +6,7 @@
"directory": "account",
"parent_title": "Getting-ready-to-request-an-account",
"links": {
"0": ""
"0": "https://docs.hpc.ugent.be/"
},
"previous_title": "account_windows_paragraph_4.2",
"next_title": "account_windows_paragraph_4.4",
(changed metadata JSON file; filename not shown)
@@ -6,8 +6,8 @@
"directory": "account",
"parent_title": "Getting-ready-to-request-an-account",
"links": {
"0": "https://docs.hpc.ugent.be/account/../account/#generating-a-publicprivate-key-pair",
"1": "https://docs.hpc.ugent.be/account/../account/#generating-a-publicprivate-key-pair"
"0": "https://docs.hpc.ugent.be/account/#generating-a-publicprivate-key-pair",
"1": "https://docs.hpc.ugent.be/account/#generating-a-publicprivate-key-pair"
},
"previous_title": "account_windows_paragraph_6.1",
"next_title": "account_windows_paragraph_6.3",
(changed metadata JSON file; filename not shown)
@@ -6,7 +6,7 @@
"directory": "connecting",
"parent_title": "First-Time-connection-to-the-HPC-infrastructure",
"links": {
"0": "https://docs.hpc.ugent.be/connecting/../troubleshooting/#warning-message-when-first-connecting-to-new-host"
"0": "https://docs.hpc.ugent.be/troubleshooting/#warning-message-when-first-connecting-to-new-host"
},
"previous_title": "connecting_paragraph_3",
"next_title": "connecting_paragraph_5",