Skip to content

Commit

Permalink
done parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
quadrismegistus committed Sep 19, 2023
1 parent 047c333 commit e5baa13
Showing 1 changed file with 79 additions and 7 deletions.
86 changes: 79 additions & 7 deletions notebooks/001_unpack_ecco.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -640,7 +640,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 22,
"id": "a13f27c6-9e95-45da-a9a1-64e928f53f19",
"metadata": {},
"outputs": [],
Expand All @@ -653,7 +653,10 @@
" if not text_id in cache:\n",
" paths = textid2paths[text_id]\n",
" cache[text_id] = parse_pages(paths)\n",
"\n"
"\n",
"def get_parsed_pages(text_id):\n",
" with get_page_cache() as cache:\n",
" return cache.get(text_id,{})"
]
},
{
Expand Down Expand Up @@ -712,26 +715,95 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"id": "69c2b974-dca9-4d71-8e6e-c756a09dfa94",
"metadata": {},
"outputs": [],
"source": [
"# parse_all_pages()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "8378462f-a774-472f-9ea9-bac02cb4dc25",
"metadata": {},
"outputs": [
{
"name": "stderr",
"name": "stdout",
"output_type": "stream",
"text": [
" 1%|▎ | 595/52686 [00:08<11:47, 73.64it/s]"
"3.46 ms ± 403 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"parse_all_pages()"
"%%timeit\n",
"eg_id=random.choice(text_ids)\n",
"get_parsed_pages(eg_id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7b37ee7b-9ca3-44d3-8335-43508c467005",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "44c4e55c-b627-4655-9317-ad22304eced1",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 25,
"id": "6e7028dd-6fd2-4308-838f-2b1275f58d67",
"metadata": {},
"outputs": [],
"source": [
"def get_parsed_page(text_id, page_id):\n",
" try:\n",
" with get_page_cache() as cache:\n",
" return cache[text_id][page_id]\n",
" except KeyError:\n",
" return {}"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "3612193f-feab-4348-a560-949c243a826c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'pageid': '00010',\n",
" 'assetid': '3328704639',\n",
" 'ocrlanguage': 'English',\n",
" 'ocr': 90.0,\n",
" 'imagelink': '157030013400010.TIF',\n",
" 'text': '\\n\\t\\t\\tTH E: WORKS OF M. DE VO LTAIR E. Vo L. XXXIV. Being Vo L. XXIV. of his PROSE WORKS.\\n\\t\\t'}"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_parsed_page(eg_id,'00010')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ca862887-2503-4e98-80bf-358921246a9a",
"id": "f0f390be-cc56-4253-8068-880b11285f42",
"metadata": {},
"outputs": [],
"source": []
Expand Down

0 comments on commit e5baa13

Please sign in to comment.