Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to generate a nucl database from the Uniprot core proteins #89

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 47 additions & 15 deletions phylophlan/phylophlan_setup_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from requests.adapters import HTTPAdapter
from requests.adapters import Retry
import time
import xmlschema


if sys.version_info[0] < 3:
Expand Down Expand Up @@ -144,8 +145,12 @@ def check_params(args, verbose=False):
info('Setting output folder "{}"\n'.format(args.output))

args.input = args.output
args.input_extension = PROTEOME_EXTENSION
args.output_extension = PROTEOME_EXTENSION
if args.db_type == "n" or args.output_extension == ".fna":
args.input_extension = GENOME_EXTENSION
args.output_extension = GENOME_EXTENSION
else:
args.input_extension = PROTEOME_EXTENSION
args.output_extension = PROTEOME_EXTENSION

if not args.db_name:
args.db_name = args.get_core_proteins
Expand All @@ -171,7 +176,7 @@ def check_params(args, verbose=False):

if verbose:
info('Setting output extension to "{}"\n'.format(args.output_extension))
else:
elif not args.get_core_proteins:
error("both -t/--db_type and -x/--output_extension were specified, don't know which one to use!",
exit=True)

Expand Down Expand Up @@ -422,6 +427,11 @@ def get_core_proteins(taxa2core_file, taxa_label, output, output_extension, verb
# # taxid taxonomy UniRefURL core_list
# 12345 k__|...|s__ http://..{}.. UP123;UP456;...


if output_extension == GENOME_EXTENSION:
# use Uniprot XML scheme to retrieve nucleotide sequences via ENA
schema = xmlschema.XMLSchema("https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot.xsd")

for r in bz2.open(taxa2core_file, 'rt'):
if r.startswith('#'):
continue
Expand Down Expand Up @@ -451,11 +461,32 @@ def get_core_proteins(taxa2core_file, taxa_label, output, output_extension, verb
info('Downloading {} core proteins for {}\n'.format(len(core_prots), lbl))

for core_prot in core_prots:
local_prot = os.path.join(output, core_prot + output_extension)
download(url.format(core_prot), local_prot, verbose=verbose)
if output_extension == PROTEOME_EXTENSION:
# Download the protein sequences
local_prot = os.path.join(output, core_prot + output_extension)
download(url.format(core_prot), local_prot, verbose=verbose)

if not os.path.exists(local_prot):
retry2download.append(core_prot)
elif output_extension == GENOME_EXTENSION:
# Download the nucleotide sequences
urlretrieve(f"https://www.uniprot.org/uniprot/{core_prot}.xml",
f"{output}/{core_prot}.xml")
try:
accid_entry = schema.to_dict(f'{output}/{core_prot}.xml')
embl_accid = ""
for dbentry in accid_entry['entry'][0]['dbReference']:
if dbentry['@type'] == "EMBL":
embl_accid = dbentry['property'][0]['@value']
urlretrieve(f"https://www.ebi.ac.uk/ena/browser/api/fasta/{embl_accid}",
f"{output}/{core_prot}{output_extension}")
break
if embl_accid == "":
not_mapped.append(core_prot)
except xmlschema.XMLSchemaChildrenValidationError:
not_mapped.append(core_prot)
os.remove(f'{output}/{core_prot}.xml')

if not os.path.exists(local_prot):
retry2download.append(core_prot)

# try to re-map the ids in case "not mapped" store in not_mapped
if retry2download:
Expand All @@ -477,16 +508,16 @@ def get_core_proteins(taxa2core_file, taxa_label, output, output_extension, verb
if not os.path.exists(local_prot):
not_mapped_again.append(core_prot)

# really don't know what else to try... I'm sorry!
if not_mapped_again:
nd_out = os.path.join(output, taxa_label + '_core_proteins_not_mapped.txt')
# really don't know what else to try... I'm sorry!
if not_mapped_again:
nd_out = os.path.join(output, taxa_label + '_core_proteins_not_mapped.txt')

if verbose:
info('There are {} core proteins that could not be downloaded, writing thier IDs to "{}"\n'
.format(len(not_mapped_again), nd_out))
if verbose:
info('There are {} core proteins that could not be downloaded, writing thier IDs to "{}"\n'
.format(len(not_mapped_again), nd_out))

with open(nd_out, 'w') as f:
f.write('\n'.join(not_mapped_again) + '\n')
with open(nd_out, 'w') as f:
f.write('\n'.join(not_mapped_again) + '\n')


def resolve_IDs(IDs_list, verbose=False):
Expand Down Expand Up @@ -543,6 +574,7 @@ def create_database(db_name, inputt, input_ext, output, overwrite, verbose=False

def phylophlan_setup_database():
args = read_params()
print(args)

if args.verbose:
info('phylophlan_setup_database.py version {} ({})\n'.format(__version__, __date__))
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
sys.stdout.write('PhyloPhlAn requires Python 3 or higher. Please update you Python installation')


install_reqs = ["biopython", "dendropy", "matplotlib", "numpy", "pandas", "seaborn"]
install_reqs = ["biopython", "dendropy", "matplotlib", "numpy", "pandas", "seaborn", "xmlschema"]

setuptools.setup(name='PhyloPhlAn',
version='3.0.3',
Expand Down