Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix dup key #1

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 65 additions & 52 deletions parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,58 +51,71 @@ def load_data(data_folder: str):
count = 0          # number of raw lines consumed so far
skipped = []       # raw lines skipped (comments, blanks, parse failures)
start_time = time.time()  # wall-clock start, used for the ETA estimate
# NOTE(review): this is the pre-change side of the diff. Every data line is
# yielded as its own document, so lines sharing the same chrom/start/end emit
# the same '_id' repeatedly -- presumably the duplicate-key problem the PR
# title refers to; confirm against the replacement version below.
for line in file:
    count += 1
    ratio = count / file_lines  # fraction of the file processed so far
    # linear extrapolation of remaining time from elapsed time
    time_left = datetime.timedelta(seconds=(time.time() - start_time) * (1 - ratio) / ratio)
    # format to use 2 decimals for progress
    # (logged for every single line -- extremely chatty on large files)
    _logger.info(f'reading line {count} ({(ratio * 100):.2f}%), #skipped {len(skipped)}, estimated time left: {time_left}')

    if line.startswith('#') or line.strip() == '':
        skipped.append(line)
        continue  # skip commented/empty lines

    try:
        (chrom, start, end, score, pdb_id, pdb_chain, uniprot_feature_name, pdb_residue_min,
            pdb_residue_max) = line.strip().split(DELIMITER)  # unpack according to schema
    except ValueError:
        # wrong number of delimited fields on this line
        _logger.error(f'failed to unpack line {count}: {line}')
        _logger.error(f'got: {line.strip().split(DELIMITER)}')
        skipped.append(line)
        continue  # skip error line

    try:  # parse each field if necessary (format, enforce datatype etc.)
        chrom = chrom.replace('chr', '')  # normalize 'chrN' -> 'N'
        start = int(start)
        end = int(end)
        score = float(score)
    except ValueError as e:
        _logger.error(f'failed to cast type for line {count}: {e}')
        skipped.append(line)
        continue  # skip error line

    _id = f'chr{chrom}:g.{start}_{end}'  # define id (genomic-position style)

    # document payload for this single line
    variant = {
        'chrom': chrom,
        'start': start,
        'end': end,
        'score': score,
        'pdb_id': pdb_id,
        'pdb_chain': pdb_chain,
        'uniprot_feature_name': uniprot_feature_name,
        'pdb_residue_min': pdb_residue_min,
        'pdb_residue_max': pdb_residue_max
    }

    # logs the full document for every line (also very chatty)
    _logger.info({
        "_id": _id,
        SOURCE_NAME: variant
    })
    yield {  # commit an entry by yielding
        "_id": _id,
        SOURCE_NAME: variant
    }
res = {}
pre_id = None # init IDs
try:
for line in file:
count += 1
ratio = count / file_lines
time_left = datetime.timedelta(seconds=(time.time() - start_time) * (1 - ratio) / ratio)
# format to use 2 decimals for progress
if count % 10000 == 0: # show progress every 500k records
_logger.info(f'reading line {count} ({(ratio * 100):.2f}%), estimated time left: {time_left}')

if line.startswith('#') or line.strip() == '':
skipped.append(line)
continue # skip commented/empty lines

try:
(chrom, start, end, score, pdb_id, pdb_chain, uniprot_feature_name,
pdb_residue_min, pdb_residue_max) = line.strip().split(DELIMITER) # unpack according to schema
except ValueError:
_logger.error(f'failed to unpack line {count}: {line}')
_logger.error(f'got: {line.strip().split(DELIMITER)}')
skipped.append(line)
continue # skip error line

try: # parse each field if necessary (format, enforce datatype etc.)
chrom = chrom.replace('chr', '')
start = int(start)
end = int(end)
score = float(score)
except ValueError as e:
_logger.error(f'failed to cast type for line {count}: {e}')
skipped.append(line)
continue # skip error line

_id = f'chr{chrom}:g.{start}_{end}' # define id

data = {
'score': score,
'pdb_id': pdb_id,
'pdb_chain': pdb_chain,
'uniprot_feature_name': uniprot_feature_name,
'pdb_residue_min': pdb_residue_min,
'pdb_residue_max': pdb_residue_max,
}

if _id != pre_id: # current id is different than previous id
if res: # res not empty (exclude first round)
yield res # yield previous result
res = {
"_id": _id,
SOURCE_NAME: {
'chrom': chrom,
'start': start,
'end': end,
'scores': [data]
}
} # take note of current result
pre_id = _id # take note of current id

else: # current id is same as previous id
res.get(SOURCE_NAME).get('scores').append(data) # merge by appending
except StopIteration: # end of file
yield res # yield last line
raise StopIteration # stop iteration

# Final summary: report how many lines were skipped out of the total,
# then echo each skipped line for post-run inspection.
_logger.info(f'parse completed, {len(skipped)}/{file_lines} lines skipped.')
for x in skipped:
    _logger.info(f'skipped line: {x.strip()}')