diff --git a/src/plugins/msigdb/__init__.py b/src/plugins/msigdb/__init__.py index ee398ac8..1623b8c4 100644 --- a/src/plugins/msigdb/__init__.py +++ b/src/plugins/msigdb/__init__.py @@ -1,3 +1,2 @@ from .dump import msigdbDumper from .upload import msigdbUploader -from .xml_encoder import xmlEncoder diff --git a/src/plugins/msigdb/dump.py b/src/plugins/msigdb/dump.py index 5ec7d638..7a17602d 100644 --- a/src/plugins/msigdb/dump.py +++ b/src/plugins/msigdb/dump.py @@ -4,6 +4,8 @@ import biothings import bs4 import config +import re + import lxml.etree as ET biothings.config_for_app(config) @@ -64,6 +66,47 @@ def create_todump_list(self, force=False): # self.to_dump.append({"remote": mouse_url, "local": self.mouse_data_file}) + def encode_xml(xml_text: str): + # Dictionary for replacements + replacements = { + '&': '&', + '': '<sup>', + '': '</sup>', + '': '<sub>', + '': '</sub>', + '': '<i>', + '': '</i>', + '': '<b>', + '': '</b>', + '
': '<BR/>', + '
': '<br/>', + ' "TRP-EGL" ': ' "TRP-EGL" ', + ' "Treg" ': ' "Treg" ', + '/=': '>/=', + '< ': '< ', + '> ': '> ', + 'or': '>or', + ' > ': ' > ', + ' < ': ' < ', + ' =< ': ' =< ', + ' => ': ' => ', + '(': '(', + ')': ')' + } + + # Apply replacements + for pattern, replacement in replacements.items(): + xml_text = re.sub(re.escape(pattern), replacement, xml_text) + + # Handle remaining cases + xml_text = re.sub(r'<([\d_.=-])', r'<\1', xml_text) + xml_text = re.sub(r'>([\d_.=-])', r'>\1', xml_text) + + return xml_text + + def sort_xml(self, file, output_file): """Sort XML file by organism Args: @@ -77,8 +120,7 @@ def sort_xml(self, file, output_file): xml_text = f.read() # Encode special characters - encoder = xmlEncoder() - encoded_xml_string = encoder.encode_xml(xml_text=xml_text) + encoded_xml_string = self.encode_xml(xml_text=xml_text) # For reference, create a file with encoded string with open(file + ".encoded", "w", encoding="utf-8") as f: diff --git a/src/plugins/msigdb/parser.py b/src/plugins/msigdb/parser.py index ebc791dc..fad92915 100644 --- a/src/plugins/msigdb/parser.py +++ b/src/plugins/msigdb/parser.py @@ -1,5 +1,6 @@ import logging import os +import re import lxml.etree as ET @@ -18,6 +19,47 @@ from utils.mygene_lookup import MyGeneLookup +def decode_xml(xml_text: str): + # Dictionary for replacements + replacements = { + '&': '&', + '<sup>': '', + '</sup>': '', + '<sub>': '', + '</sub>': '', + '<i>': '', + '</i>': '', + '<b>': '', + '</b>': '', + '<BR/>': '
', + '<br/>': '
', + '"TRP-EGL"': ' "TRP-EGL" ', + '"Treg"': ' "Treg" ', + '<=': '/=', + '< ': '< ', + '> ': '> ', + '<or': 'or', + ' > ': ' > ', + ' < ': ' < ', + ' =< ': ' =< ', + ' => ': ' => ', + '(': '(', + ')': ')' + } + + # Apply replacements + for pattern, replacement in replacements.items(): + xml_text = xml_text.replace(pattern, replacement) + + # Handle remaining cases + xml_text = re.sub(r'<([\d_.=-])', r'<\1', xml_text) + xml_text = re.sub(r'>([\d_.=-])', r'>\1', xml_text) + + return xml_text + + def parse_msigdb(data_folder): """ The XML data provides original gene ids, as well as orthology-converted ones. @@ -51,9 +93,6 @@ def parse_msigdb(data_folder): "8": "cell type signature genesets", } - # Decode special characters - encoder = xmlEncoder() - for f in ["human_genesets.xml"]: data_file = os.path.join(data_folder, f) # File contains newline-delimited XML documents. Each document is a single geneset. @@ -188,7 +227,7 @@ def parse_msigdb(data_folder): # Replace & by space in all fields of msigdb for key, value in msigdb.items(): if isinstance(value, str): - msigdb[key] = encoder.decode_xml(value) + msigdb[key] = decode_xml(value) doc["msigdb"] = msigdb # Remove lists with only one item diff --git a/src/plugins/msigdb/xml_encoder.py b/src/plugins/msigdb/xml_encoder.py index acda4f66..dd321fd9 100644 --- a/src/plugins/msigdb/xml_encoder.py +++ b/src/plugins/msigdb/xml_encoder.py @@ -2,83 +2,5 @@ class xmlEncoder(): - def encode_xml(xml_text: str): - # Dictionary for replacements - replacements = { - '&': '&', - '': '<sup>', - '': '</sup>', - '': '<sub>', - '': '</sub>', - '': '<i>', - '': '</i>', - '': '<b>', - '': '</b>', - '
': '<BR/>', - '
': '<br/>', - ' "TRP-EGL" ': ' "TRP-EGL" ', - ' "Treg" ': ' "Treg" ', - '/=': '>/=', - '< ': '< ', - '> ': '> ', - 'or': '>or', - ' > ': ' > ', - ' < ': ' < ', - ' =< ': ' =< ', - ' => ': ' => ', - '(': '(', - ')': ')' - } - # Apply replacements - for pattern, replacement in replacements.items(): - xml_text = re.sub(re.escape(pattern), replacement, xml_text) - # Handle remaining cases - xml_text = re.sub(r'<([\d_.=-])', r'<\1', xml_text) - xml_text = re.sub(r'>([\d_.=-])', r'>\1', xml_text) - - return xml_text - - - def decode_xml(xml_text: str): - # Dictionary for replacements - replacements = { - '&': '&', - '<sup>': '', - '</sup>': '', - '<sub>': '', - '</sub>': '', - '<i>': '', - '</i>': '', - '<b>': '', - '</b>': '', - '<BR/>': '
', - '<br/>': '
', - '"TRP-EGL"': ' "TRP-EGL" ', - '"Treg"': ' "Treg" ', - '<=': '/=', - '< ': '< ', - '> ': '> ', - '<or': 'or', - ' > ': ' > ', - ' < ': ' < ', - ' =< ': ' =< ', - ' => ': ' => ', - '(': '(', - ')': ')' - } - - # Apply replacements - for pattern, replacement in replacements.items(): - xml_text = xml_text.replace(pattern, replacement) - - # Handle remaining cases - xml_text = re.sub(r'<([\d_.=-])', r'<\1', xml_text) - xml_text = re.sub(r'>([\d_.=-])', r'>\1', xml_text) - - return xml_text \ No newline at end of file