Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix dup key #1

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 65 additions & 52 deletions parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,58 +51,71 @@ def load_data(data_folder: str):
count = 0          # number of raw lines consumed so far
skipped = []       # raw lines skipped (comments, blanks, parse failures)
start_time = time.time()  # wall-clock start, used for the ETA estimate
# NOTE(review): this is the pre-change side of the diff. Every data line is
# yielded as its own document, so lines sharing the same chrom/start/end emit
# the same '_id' repeatedly -- presumably the duplicate-key problem the PR
# title refers to; confirm against the replacement version below.
for line in file:
    count += 1
    ratio = count / file_lines  # fraction of the file processed so far
    # linear extrapolation of remaining time from elapsed time
    time_left = datetime.timedelta(seconds=(time.time() - start_time) * (1 - ratio) / ratio)
    # format to use 2 decimals for progress
    # (logged for every single line -- extremely chatty on large files)
    _logger.info(f'reading line {count} ({(ratio * 100):.2f}%), #skipped {len(skipped)}, estimated time left: {time_left}')

    if line.startswith('#') or line.strip() == '':
        skipped.append(line)
        continue  # skip commented/empty lines

    try:
        (chrom, start, end, score, pdb_id, pdb_chain, uniprot_feature_name, pdb_residue_min,
            pdb_residue_max) = line.strip().split(DELIMITER)  # unpack according to schema
    except ValueError:
        # wrong number of delimited fields on this line
        _logger.error(f'failed to unpack line {count}: {line}')
        _logger.error(f'got: {line.strip().split(DELIMITER)}')
        skipped.append(line)
        continue  # skip error line

    try:  # parse each field if necessary (format, enforce datatype etc.)
        chrom = chrom.replace('chr', '')  # normalize 'chrN' -> 'N'
        start = int(start)
        end = int(end)
        score = float(score)
    except ValueError as e:
        _logger.error(f'failed to cast type for line {count}: {e}')
        skipped.append(line)
        continue  # skip error line

    _id = f'chr{chrom}:g.{start}_{end}'  # define id (genomic-position style)

    # document payload for this single line
    variant = {
        'chrom': chrom,
        'start': start,
        'end': end,
        'score': score,
        'pdb_id': pdb_id,
        'pdb_chain': pdb_chain,
        'uniprot_feature_name': uniprot_feature_name,
        'pdb_residue_min': pdb_residue_min,
        'pdb_residue_max': pdb_residue_max
    }

    # logs the full document for every line (also very chatty)
    _logger.info({
        "_id": _id,
        SOURCE_NAME: variant
    })
    yield {  # commit an entry by yielding
        "_id": _id,
        SOURCE_NAME: variant
    }
res = {}
pre_id = None # init IDs
try:
for line in file:
count += 1
ratio = count / file_lines
time_left = datetime.timedelta(seconds=(time.time() - start_time) * (1 - ratio) / ratio)
# format to use 2 decimals for progress
if count % 10000 == 0: # show progress every 500k records
_logger.info(f'reading line {count} ({(ratio * 100):.2f}%), estimated time left: {time_left}')

if line.startswith('#') or line.strip() == '':
skipped.append(line)
continue # skip commented/empty lines

try:
(chrom, start, end, score, pdb_id, pdb_chain, uniprot_feature_name,
pdb_residue_min, pdb_residue_max) = line.strip().split(DELIMITER) # unpack according to schema
except ValueError:
_logger.error(f'failed to unpack line {count}: {line}')
_logger.error(f'got: {line.strip().split(DELIMITER)}')
skipped.append(line)
continue # skip error line

try: # parse each field if necessary (format, enforce datatype etc.)
chrom = chrom.replace('chr', '')
start = int(start)
end = int(end)
score = float(score)
except ValueError as e:
_logger.error(f'failed to cast type for line {count}: {e}')
skipped.append(line)
continue # skip error line

_id = f'chr{chrom}:g.{start}_{end}' # define id

data = {
'score': score,
'pdb_id': pdb_id,
'pdb_chain': pdb_chain,
'uniprot_feature_name': uniprot_feature_name,
'pdb_residue_min': pdb_residue_min,
'pdb_residue_max': pdb_residue_max,
}

if _id != pre_id: # current id is different than previous id
if res: # res not empty (exclude first round)
yield res # yield previous result
res = {
"_id": _id,
SOURCE_NAME: {
'chrom': chrom,
'start': start,
'end': end,
'scores': [data]
}
} # take note of current result
pre_id = _id # take note of current id

else: # current id is same as previous id
res.get(SOURCE_NAME).get('scores').append(data) # merge by appending
except StopIteration: # end of file
yield res # yield last line
raise StopIteration # stop iteration

# Final summary: report how many lines were skipped out of the total,
# then echo each skipped line for post-run inspection.
_logger.info(f'parse completed, {len(skipped)}/{file_lines} lines skipped.')
for x in skipped:
    _logger.info(f'skipped line: {x.strip()}')