Skip to content

Commit

Permalink
Clean code for msigdb.
Browse files Browse the repository at this point in the history
  • Loading branch information
everaldorodrigo committed Apr 1, 2024
1 parent 243c800 commit 31d9e32
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 85 deletions.
1 change: 0 additions & 1 deletion src/plugins/msigdb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
from .dump import msigdbDumper
from .upload import msigdbUploader
from .xml_encoder import xmlEncoder
46 changes: 44 additions & 2 deletions src/plugins/msigdb/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import biothings
import bs4
import config
import re

import lxml.etree as ET

biothings.config_for_app(config)
Expand Down Expand Up @@ -64,6 +66,47 @@ def create_todump_list(self, force=False):
# self.to_dump.append({"remote": mouse_url, "local": self.mouse_data_file})


def encode_xml(xml_text: str):
    """Escape characters in raw MSigDB XML text that would break XML parsing.

    Inline markup tags (sup/sub/i/b/br), comparison operators written with
    bare ``<``/``>``, two specific quoted terms, parentheses and bare
    ampersands are rewritten as XML entities.

    Args:
        xml_text: The XML text to escape.

    Returns:
        The escaped text.

    NOTE(review): sort_xml calls this as ``self.encode_xml(xml_text=...)``.
    If this def lives inside the dumper class body it needs ``@staticmethod``
    (or a ``self`` parameter) for that call to work - confirm against the
    class layout, which is not visible here.
    """
    # Order matters: '&' must be escaped first, so that the ampersands
    # introduced by the later replacements are not escaped a second time.
    replacements = {
        '&': '&amp;',
        '<sup>': '&lt;sup&gt;',
        '</sup>': '&lt;/sup&gt;',
        '<sub>': '&lt;sub&gt;',
        '</sub>': '&lt;/sub&gt;',
        '<i>': '&lt;i&gt;',
        '</i>': '&lt;/i&gt;',
        '<b>': '&lt;b&gt;',
        '</b>': '&lt;/b&gt;',
        '<BR/>': '&lt;BR/&gt;',
        '<br/>': '&lt;br/&gt;',
        ' "TRP-EGL" ': ' &quot;TRP-EGL&quot; ',
        ' "Treg" ': ' &quot;Treg&quot; ',
        '</=': '&lt;/=',
        '>/=': '&gt;/=',
        '< ': '&lt; ',
        '> ': '&gt; ',
        '<or': '&lt;or',
        '>or': '&gt;or',
        ' > ': ' &gt; ',
        ' < ': ' &lt; ',
        ' =< ': ' =&lt; ',
        ' => ': ' =&gt; ',
        '(': '&#40;',
        ')': '&#41;',
    }

    # All patterns are literal strings, so plain str.replace is sufficient.
    for pattern, replacement in replacements.items():
        xml_text = xml_text.replace(pattern, replacement)

    # Remaining cases: '<' or '>' directly followed by a digit or one of
    # '_', '.', '=', '-' (e.g. "p<0.05", ">=2").
    xml_text = re.sub(r'<([\d_.=-])', r'&lt;\1', xml_text)
    xml_text = re.sub(r'>([\d_.=-])', r'&gt;\1', xml_text)

    return xml_text


def sort_xml(self, file, output_file):
"""Sort XML file by organism
Args:
Expand All @@ -77,8 +120,7 @@ def sort_xml(self, file, output_file):
xml_text = f.read()

# Encode special characters
encoder = xmlEncoder()
encoded_xml_string = encoder.encode_xml(xml_text=xml_text)
encoded_xml_string = self.encode_xml(xml_text=xml_text)

# For reference, create a file with encoded string
with open(file + ".encoded", "w", encoding="utf-8") as f:
Expand Down
47 changes: 43 additions & 4 deletions src/plugins/msigdb/parser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import os
import re

import lxml.etree as ET

Expand All @@ -18,6 +19,47 @@
from utils.mygene_lookup import MyGeneLookup


def decode_xml(xml_text: str):
    """Turn the XML entities produced when escaping MSigDB text back into
    plain characters.

    Handles ``&amp;``, the escaped inline markup tags (sup/sub/i/b/br), two
    specific quoted terms, escaped comparison operators and the numeric
    entities for parentheses.

    Args:
        xml_text: Text that may still contain XML entities.

    Returns:
        The text with the known entities decoded.
    """
    # Applied in insertion order; '&amp;' is decoded first, so text that was
    # escaped twice ("&amp;lt; ") is fully decoded by the later entries.
    replacements = {
        '&amp;': '&',
        '&lt;sup&gt;': '<sup>',
        '&lt;/sup&gt;': '</sup>',
        '&lt;sub&gt;': '<sub>',
        '&lt;/sub&gt;': '</sub>',
        '&lt;i&gt;': '<i>',
        '&lt;/i&gt;': '</i>',
        '&lt;b&gt;': '<b>',
        '&lt;/b&gt;': '</b>',
        '&lt;BR/&gt;': '<BR/>',
        '&lt;br/&gt;': '<br/>',
        # Only the entities are replaced; surrounding spaces are left as
        # they are so decoding does not insert extra whitespace.
        '&quot;TRP-EGL&quot;': '"TRP-EGL"',
        '&quot;Treg&quot;': '"Treg"',
        # Exact inverses of the '</=' and '>/=' escapes. Plain '&lt;=' and
        # '&gt;=' are handled by the regex fallback below and become
        # '<=' / '>=' rather than '</=' / '>/='.
        '&lt;/=': '</=',
        '&gt;/=': '>/=',
        '&lt; ': '< ',
        '&gt; ': '> ',
        '&lt;or': '<or',
        '&gt;or': '>or',
        ' &gt; ': ' > ',
        ' &lt; ': ' < ',
        ' =&lt; ': ' =< ',
        ' =&gt; ': ' => ',
        '&#40;': '(',
        '&#41;': ')',
    }

    for pattern, replacement in replacements.items():
        xml_text = xml_text.replace(pattern, replacement)

    # Remaining cases: an escaped '<' or '>' directly followed by a digit or
    # one of '_', '.', '=', '-' (e.g. "p&lt;0.05", "&gt;=2").
    xml_text = re.sub(r'&lt;([\d_.=-])', r'<\1', xml_text)
    xml_text = re.sub(r'&gt;([\d_.=-])', r'>\1', xml_text)

    return xml_text


def parse_msigdb(data_folder):
"""
The XML data provides original gene ids, as well as orthology-converted ones.
Expand Down Expand Up @@ -51,9 +93,6 @@ def parse_msigdb(data_folder):
"8": "cell type signature genesets",
}

# Decode special characters
encoder = xmlEncoder()

for f in ["human_genesets.xml"]:
data_file = os.path.join(data_folder, f)
# File contains newline-delimited XML documents. Each document is a single geneset.
Expand Down Expand Up @@ -188,7 +227,7 @@ def parse_msigdb(data_folder):
# Decode XML entities (e.g. &amp; -> &) in all string fields of msigdb
for key, value in msigdb.items():
if isinstance(value, str):
msigdb[key] = encoder.decode_xml(value)
msigdb[key] = decode_xml(value)

doc["msigdb"] = msigdb
# Remove lists with only one item
Expand Down
78 changes: 0 additions & 78 deletions src/plugins/msigdb/xml_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,83 +2,5 @@

class xmlEncoder():
    """Escape and unescape special characters in MSigDB XML text.

    Both methods are static: they take no ``self``, so they can be called
    either on the class or on an instance (``xmlEncoder().encode_xml(...)``).
    """

    @staticmethod
    def encode_xml(xml_text: str):
        """Escape characters in raw XML text that would break XML parsing.

        Args:
            xml_text: The XML text to escape.

        Returns:
            The escaped text.
        """
        # Order matters: '&' must be escaped first, so that the ampersands
        # introduced by the later replacements are not escaped a second time.
        replacements = {
            '&': '&amp;',
            '<sup>': '&lt;sup&gt;',
            '</sup>': '&lt;/sup&gt;',
            '<sub>': '&lt;sub&gt;',
            '</sub>': '&lt;/sub&gt;',
            '<i>': '&lt;i&gt;',
            '</i>': '&lt;/i&gt;',
            '<b>': '&lt;b&gt;',
            '</b>': '&lt;/b&gt;',
            '<BR/>': '&lt;BR/&gt;',
            '<br/>': '&lt;br/&gt;',
            ' "TRP-EGL" ': ' &quot;TRP-EGL&quot; ',
            ' "Treg" ': ' &quot;Treg&quot; ',
            '</=': '&lt;/=',
            '>/=': '&gt;/=',
            '< ': '&lt; ',
            '> ': '&gt; ',
            '<or': '&lt;or',
            '>or': '&gt;or',
            ' > ': ' &gt; ',
            ' < ': ' &lt; ',
            ' =< ': ' =&lt; ',
            ' => ': ' =&gt; ',
            '(': '&#40;',
            ')': '&#41;',
        }

        # All patterns are literal strings, so plain str.replace is enough.
        for pattern, replacement in replacements.items():
            xml_text = xml_text.replace(pattern, replacement)

        # Remaining cases: '<' or '>' directly followed by a digit or one of
        # '_', '.', '=', '-' (e.g. "p<0.05", ">=2").
        xml_text = re.sub(r'<([\d_.=-])', r'&lt;\1', xml_text)
        xml_text = re.sub(r'>([\d_.=-])', r'&gt;\1', xml_text)

        return xml_text

    @staticmethod
    def decode_xml(xml_text: str):
        """Turn the entities produced by ``encode_xml`` back into characters.

        Args:
            xml_text: Text that may still contain XML entities.

        Returns:
            The text with the known entities decoded.
        """
        # Applied in insertion order; '&amp;' is decoded first.
        replacements = {
            '&amp;': '&',
            '&lt;sup&gt;': '<sup>',
            '&lt;/sup&gt;': '</sup>',
            '&lt;sub&gt;': '<sub>',
            '&lt;/sub&gt;': '</sub>',
            '&lt;i&gt;': '<i>',
            '&lt;/i&gt;': '</i>',
            '&lt;b&gt;': '<b>',
            '&lt;/b&gt;': '</b>',
            '&lt;BR/&gt;': '<BR/>',
            '&lt;br/&gt;': '<br/>',
            # Only the entities are replaced; surrounding spaces are left as
            # they are so decoding does not insert extra whitespace.
            '&quot;TRP-EGL&quot;': '"TRP-EGL"',
            '&quot;Treg&quot;': '"Treg"',
            # Exact inverses of encode_xml's '</=' and '>/=' escapes. Plain
            # '&lt;=' / '&gt;=' fall through to the regex fallback below and
            # become '<=' / '>='.
            '&lt;/=': '</=',
            '&gt;/=': '>/=',
            '&lt; ': '< ',
            '&gt; ': '> ',
            '&lt;or': '<or',
            '&gt;or': '>or',
            ' &gt; ': ' > ',
            ' &lt; ': ' < ',
            ' =&lt; ': ' =< ',
            ' =&gt; ': ' => ',
            '&#40;': '(',
            '&#41;': ')',
        }

        for pattern, replacement in replacements.items():
            xml_text = xml_text.replace(pattern, replacement)

        # Remaining cases: an escaped '<' or '>' directly followed by a digit
        # or one of '_', '.', '=', '-'.
        xml_text = re.sub(r'&lt;([\d_.=-])', r'<\1', xml_text)
        xml_text = re.sub(r'&gt;([\d_.=-])', r'>\1', xml_text)

        return xml_text

0 comments on commit 31d9e32

Please sign in to comment.