diff --git a/src/plugins/msigdb/__init__.py b/src/plugins/msigdb/__init__.py
index ee398ac8..1623b8c4 100644
--- a/src/plugins/msigdb/__init__.py
+++ b/src/plugins/msigdb/__init__.py
@@ -1,3 +1,2 @@
from .dump import msigdbDumper
from .upload import msigdbUploader
-from .xml_encoder import xmlEncoder
diff --git a/src/plugins/msigdb/dump.py b/src/plugins/msigdb/dump.py
index 5ec7d638..7a17602d 100644
--- a/src/plugins/msigdb/dump.py
+++ b/src/plugins/msigdb/dump.py
@@ -4,6 +4,8 @@
import biothings
import bs4
import config
+import re
+
import lxml.etree as ET
biothings.config_for_app(config)
@@ -64,6 +66,47 @@ def create_todump_list(self, force=False):
# self.to_dump.append({"remote": mouse_url, "local": self.mouse_data_file})
+    @staticmethod
+    def encode_xml(xml_text: str) -> str:
+        """Encode special characters in XML text before it is parsed.
+
+        Every key of the replacement table is substituted by its value as a
+        literal match (the key is passed through ``re.escape``), in table
+        order, so earlier entries can change what later entries see. Two
+        regex substitutions then handle ``<`` / ``>`` directly followed by a
+        digit or one of ``_ . = -``.
+
+        Declared as a static method because the body uses no instance state
+        and ``sort_xml`` invokes it as ``self.encode_xml(xml_text=...)``.
+        Without the decorator the bound instance would be passed as the
+        first positional argument in addition to the ``xml_text`` keyword,
+        and the call would raise ``TypeError``.
+
+        Args:
+            xml_text: XML text to encode.
+
+        Returns:
+            The text after all substitutions have been applied.
+        """
+        # Literal pattern -> replacement table, applied in insertion order.
+        # NOTE(review): in this view several entries read as identity
+        # mappings, repeat an empty key, or contain a raw line break inside
+        # the quotes -- presumably HTML entities/tags were lost when the
+        # patch was rendered. Confirm against the repository file before
+        # relying on the exact literals below.
+        replacements = {
+            '&': '&',
+            '': '<sup>',
+            '': '</sup>',
+            '': '<sub>',
+            '': '</sub>',
+            '': '<i>',
+            '': '</i>',
+            '': '<b>',
+            '': '</b>',
+            '
': '<BR/>',
+            '
': '<br/>',
+            ' "TRP-EGL" ': ' "TRP-EGL" ',
+            ' "Treg" ': ' "Treg" ',
+            '=': '</=',
+            '>/=': '>/=',
+            '< ': '< ',
+            '> ': '> ',
+            'or': '>or',
+            ' > ': ' > ',
+            ' < ': ' < ',
+            ' =< ': ' =< ',
+            ' => ': ' => ',
+            '(': '(',
+            ')': ')'
+        }
+
+        # Apply each table entry as a literal (escaped) pattern.
+        for pattern, replacement in replacements.items():
+            xml_text = re.sub(re.escape(pattern), replacement, xml_text)
+
+        # Handle remaining cases: an angle bracket immediately followed by a
+        # digit or one of "_", ".", "=", "-".
+        xml_text = re.sub(r'<([\d_.=-])', r'<\1', xml_text)
+        xml_text = re.sub(r'>([\d_.=-])', r'>\1', xml_text)
+
+        return xml_text
+
+
def sort_xml(self, file, output_file):
"""Sort XML file by organism
Args:
@@ -77,8 +120,7 @@ def sort_xml(self, file, output_file):
xml_text = f.read()
# Encode special characters
- encoder = xmlEncoder()
- encoded_xml_string = encoder.encode_xml(xml_text=xml_text)
+ encoded_xml_string = self.encode_xml(xml_text=xml_text)
# For reference, create a file with encoded string
with open(file + ".encoded", "w", encoding="utf-8") as f:
diff --git a/src/plugins/msigdb/parser.py b/src/plugins/msigdb/parser.py
index ebc791dc..fad92915 100644
--- a/src/plugins/msigdb/parser.py
+++ b/src/plugins/msigdb/parser.py
@@ -1,5 +1,6 @@
import logging
import os
+import re
import lxml.etree as ET
@@ -18,6 +19,47 @@
from utils.mygene_lookup import MyGeneLookup
+def decode_xml(xml_text: str) -> str:
+ """Apply the fixed decode replacement table to a string and return it.
+
+ Every key of ``replacements`` is replaced by its value with
+ ``str.replace``, in dict insertion order, so earlier entries can change
+ what later entries match. Two regex substitutions then handle ``<`` /
+ ``>`` directly followed by a digit or one of ``_ . = -``.
+
+ Args:
+ xml_text: Text to decode; ``parse_msigdb`` passes each string-valued
+ field of the ``msigdb`` dict through this function.
+
+ Returns:
+ The text after all substitutions have been applied.
+ """
+ # Literal substring -> replacement table, applied in insertion order.
+ # NOTE(review): in this view several entries read as identity mappings,
+ # map to an empty string, or contain a raw line break inside the quotes --
+ # presumably HTML entities/tags were lost when the patch was rendered.
+ # Confirm against the repository file before relying on these literals.
+ replacements = {
+ '&': '&',
+ '<sup>': '',
+ '</sup>': '',
+ '<sub>': '',
+ '</sub>': '',
+ '<i>': '',
+ '</i>': '',
+ '<b>': '',
+ '</b>': '',
+ '<BR/>': '
',
+ '<br/>': '
',
+ '"TRP-EGL"': ' "TRP-EGL" ',
+ '"Treg"': ' "Treg" ',
+ '<=': '=',
+ '>=': '>/=',
+ '< ': '< ',
+ '> ': '> ',
+ '<or': 'or',
+ ' > ': ' > ',
+ ' < ': ' < ',
+ ' =< ': ' =< ',
+ ' => ': ' => ',
+ '(': '(',
+ ')': ')'
+ }
+
+ # Apply replacements as plain substring substitutions (no regex here).
+ for pattern, replacement in replacements.items():
+ xml_text = xml_text.replace(pattern, replacement)
+
+ # Handle remaining cases: an angle bracket immediately followed by a
+ # digit or one of "_", ".", "=", "-".
+ xml_text = re.sub(r'<([\d_.=-])', r'<\1', xml_text)
+ xml_text = re.sub(r'>([\d_.=-])', r'>\1', xml_text)
+
+ return xml_text
+
+
def parse_msigdb(data_folder):
"""
The XML data provides original gene ids, as well as orthology-converted ones.
@@ -51,9 +93,6 @@ def parse_msigdb(data_folder):
"8": "cell type signature genesets",
}
- # Decode special characters
- encoder = xmlEncoder()
-
for f in ["human_genesets.xml"]:
data_file = os.path.join(data_folder, f)
# File contains newline-delimited XML documents. Each document is a single geneset.
@@ -188,7 +227,7 @@ def parse_msigdb(data_folder):
# Replace & by space in all fields of msigdb
for key, value in msigdb.items():
if isinstance(value, str):
- msigdb[key] = encoder.decode_xml(value)
+ msigdb[key] = decode_xml(value)
doc["msigdb"] = msigdb
# Remove lists with only one item
diff --git a/src/plugins/msigdb/xml_encoder.py b/src/plugins/msigdb/xml_encoder.py
index acda4f66..dd321fd9 100644
--- a/src/plugins/msigdb/xml_encoder.py
+++ b/src/plugins/msigdb/xml_encoder.py
@@ -2,83 +2,5 @@
class xmlEncoder():
- def encode_xml(xml_text: str):
- # Dictionary for replacements
- replacements = {
- '&': '&',
- '': '<sup>',
- '': '</sup>',
- '': '<sub>',
- '': '</sub>',
- '': '<i>',
- '': '</i>',
- '': '<b>',
- '': '</b>',
- '
': '<BR/>',
- '
': '<br/>',
- ' "TRP-EGL" ': ' "TRP-EGL" ',
- ' "Treg" ': ' "Treg" ',
- '=': '</=',
- '>/=': '>/=',
- '< ': '< ',
- '> ': '> ',
- 'or': '>or',
- ' > ': ' > ',
- ' < ': ' < ',
- ' =< ': ' =< ',
- ' => ': ' => ',
- '(': '(',
- ')': ')'
- }
- # Apply replacements
- for pattern, replacement in replacements.items():
- xml_text = re.sub(re.escape(pattern), replacement, xml_text)
- # Handle remaining cases
- xml_text = re.sub(r'<([\d_.=-])', r'<\1', xml_text)
- xml_text = re.sub(r'>([\d_.=-])', r'>\1', xml_text)
-
- return xml_text
-
-
- def decode_xml(xml_text: str):
- # Dictionary for replacements
- replacements = {
- '&': '&',
- '<sup>': '',
- '</sup>': '',
- '<sub>': '',
- '</sub>': '',
- '<i>': '',
- '</i>': '',
- '<b>': '',
- '</b>': '',
- '<BR/>': '
',
- '<br/>': '
',
- '"TRP-EGL"': ' "TRP-EGL" ',
- '"Treg"': ' "Treg" ',
- '<=': '=',
- '>=': '>/=',
- '< ': '< ',
- '> ': '> ',
- '<or': 'or',
- ' > ': ' > ',
- ' < ': ' < ',
- ' =< ': ' =< ',
- ' => ': ' => ',
- '(': '(',
- ')': ')'
- }
-
- # Apply replacements
- for pattern, replacement in replacements.items():
- xml_text = xml_text.replace(pattern, replacement)
-
- # Handle remaining cases
- xml_text = re.sub(r'<([\d_.=-])', r'<\1', xml_text)
- xml_text = re.sub(r'>([\d_.=-])', r'>\1', xml_text)
-
- return xml_text
\ No newline at end of file