
Commit

add advanced full context and example
Sdddell committed Oct 30, 2024
1 parent 64ec399 commit 439fdc3
Showing 4 changed files with 149 additions and 1 deletion.
3 changes: 3 additions & 0 deletions any_parser/any_parser.py
@@ -25,6 +25,7 @@ class ProcessType(Enum):
     TABLE = "table"
     FILE_REFINED = "file_refined"
     FILE_REFINED_QUICK = "file_refined_quick"
+    FILE_ADVANCED = "file_advanced"
 
 
 class AnyParser:
@@ -221,6 +222,8 @@ def async_extract(
             process_type = ProcessType.FILE
         elif model == ModelType.PRO:
             process_type = ProcessType.FILE_REFINED_QUICK
+        elif model == ModelType.ADVANCED:
+            process_type = ProcessType.FILE_ADVANCED
         else:
             return "Error: Invalid model type", None
 
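These two hunks route the new ModelType.ADVANCED to ProcessType.FILE_ADVANCED inside async_extract. A minimal sketch of a call that exercises the new branch (hypothetical API key; the third argument follows the empty options dict used in the example notebook below):

from any_parser import AnyParser, ModelType

ap = AnyParser(api_key="...")  # replace with a real API key

# ModelType.ADVANCED now maps to the file_advanced process type
file_id = ap.async_extract("./sample_data/test_1figure_1table.png", ModelType.ADVANCED, {})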
3 changes: 2 additions & 1 deletion any_parser/utils.py
@@ -8,6 +8,7 @@
 class ModelType(Enum):
     BASE = "base"
     PRO = "pro"
+    ADVANCED = "advanced"
 
 
 SUPPORTED_FILE_EXTENSIONS = [
@@ -48,7 +49,7 @@ def upload_file_to_presigned_url(
 
 
 def check_model(model: ModelType) -> None:
-    if model not in {ModelType.BASE, ModelType.PRO}:
+    if model not in {ModelType.BASE, ModelType.PRO, ModelType.ADVANCED}:
         valid_models = ", ".join(["`" + model.value + "`" for model in ModelType])
         return f"Invalid model type: {model}. Supported `model` types include {valid_models}."
 
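With ADVANCED added to both the enum and the allowed set, check_model returns None for all three members and an error string otherwise (note it returns the message rather than raising, despite the -> None annotation). A rough sketch of the expected behavior, assuming check_model and ModelType are importable from any_parser.utils:

from any_parser.utils import ModelType, check_model

# Supported members produce no error message
assert check_model(ModelType.ADVANCED) is None

# Anything else yields an error string enumerating the valid values
print(check_model("fast"))
# Invalid model type: fast. Supported `model` types include `base`, `pro`, `advanced`.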
144 changes: 144 additions & 0 deletions examples/async_advanced_full_context.ipynb
@@ -0,0 +1,144 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install the libraries (ipython is used for displaying markdown in this demo)\n",
"# !pip3 install --upgrade ipython\n",
"# !pip3 install --upgrade any-parser"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import display, Markdown\n",
"from any_parser import AnyParser, ModelType"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"ap = AnyParser(api_key=\"...\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"file_path = \"./sample_data/test_1figure_1table.png\"\n",
"file_id = ap.async_extract(file_path, ModelType.ADVANCED, {})"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Waiting for response...\n"
]
}
],
"source": [
"markdown_output = ap.async_fetch(file_id=file_id)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"\n",
"<table>\n",
"<tbody>\n",
"<tr><td> </td><td>latency </td><td>(ms) </td></tr>\n",
"<tr><td>participants</td><td>mean </td><td>99th percentile</td></tr>\n",
"<tr><td>1 </td><td>17.0 +1.4 </td><td>75.0 34.9 </td></tr>\n",
"<tr><td>2 </td><td>24.5 +2.5 </td><td>87.6 35.9 </td></tr>\n",
"<tr><td>5 </td><td>31.5 +6.2 </td><td>104.5 52.2 </td></tr>\n",
"<tr><td>10 </td><td>30.0 +3.7 </td><td>95.6 25.4 </td></tr>\n",
"<tr><td>25 </td><td>35.5 +5.6 </td><td>100.4 42.7 </td></tr>\n",
"<tr><td>50 </td><td>42.7 4.1 </td><td>93.7 22.9 </td></tr>\n",
"<tr><td>100 </td><td>71.4 +7.6 </td><td>131.2 11.6 </td></tr>\n",
"<tr><td>200 </td><td>150.5 +11.0</td><td>320.3 35.1 </td></tr>\n",
"</tbody>\n",
"</table>\n",
"\n",
"\n",
"\n",
"Table 4: Two-phase commit scalability. Mean and standard deviations over 10 runs.\n",
"\n",
"CPUs. Snapshot reads can execute at any up-to-date replicas, so their throughput increases almost linearly with the number of replicas. Single-read read-only transactions only execute at leaders because timestamp assignment must happen at leaders. Read-only-transaction throughput increases with the number of replicas because the number of effective spanservers increases: in the experimental setup, the number of spanservers equaled the number of replicas, and leaders were randomly distributed among the zones. Write throughput benefits from the same experimental artifact (which explains the increase in throughput from 3 to 5 replicas), but that benefit is outweighed by the linear increase in the amount of work performed per write, as the number of replicas increases.\n",
"\n",
"Table 4 demonstrates that two-phase commit can scale to a reasonable number of participants: it summarizes a set of experiments run across 3 zones, each with 25 spanservers. Scaling up to 50 participants is reasonable in both mean and 99th-percentile, and latencies start to rise noticeably at 100 participants.\n",
"\n",
"5.2 Availability\n",
"\n",
"Figure 5 illustrates the availability benefits of running Spanner in multiple datacenters. It shows the results of three experiments on throughput in the presence of datacenter failure, all of which are overlaid onto the same time scale. The test universe consisted of 5 zones \\(Z_i\\), each of which had 25 spanservers. The test database was sharded into 1250 Paxos groups, and 100 test clients constantly issued non-snapshot reads at an aggregate rate of 50K reads/second. All of the leaders were explicitly placed in \\(Z_1\\). Five seconds into each test, all of the servers in one zone were killed: non-leader kills \\(Z_2\\); leader-hard kills \\(Z_1\\); leader-soft kills \\(Z_1\\), but it gives notifications to all of the servers that they should handoff leadership first.\n",
"\n",
"Killing \\(Z_2\\) has no effect on read throughput. Killing \\(Z_1\\) while giving the leaders time to handoff leadership to a different zone has a minor effect: the throughput drop is not visible in the graph, but is around 3-4%. On the other hand, killing \\(Z_1\\) with no warning has a severe effect: the rate of completion drops almost to 0. As leaders get re-elected, though, the throughput of the system rises to approximately 100K reads/second because of two artifacts of our experiment: there is extra capacity in the system, and operations are queued while the leader is unavailable. As a result, the throughput of the system rises before leveling off again at its steady-state rate.\n",
"\n",
"We can also see the effect of the fact that Paxos leader leases are set to 10 seconds. When we kill the zone, the leader-lease expiration times for the groups should be evenly distributed over the next 10 seconds. Soon after each lease from a dead leader expires, a new leader is elected. Approximately 10 seconds after the kill time, all of the groups have leaders and throughput has recovered. Shorter lease times would reduce the effect of server deaths on availability, but would require greater amounts of lease-renewal network traffic. We are in the process of designing and implementing a mechanism that will cause slaves to release Paxos leader leases upon leader failure.\n",
"\n",
"5.3 TrueTime\n",
"\n",
"Two questions must be answered with respect to TrueTime: is ε truly a bound on clock uncertainty, and how bad does ε get? For the former, the most serious problem would be if a local clock’s drift were greater than 200μsec/sec: that would break assumptions made by TrueTime. Our machine statistics show that bad CPUs are 6 times more likely than bad clocks. That is, clock issues are extremely infrequent, relative to much more serious hardware problems. As a result, we believe that TrueTime’s implementation is as trustworthy as any other piece of software upon which Spanner depends.\n",
"\n",
"Figure 6 presents TrueTime data taken at several thousand spanservers across three datacenters up to 2200\n",
"\n",
"![<@mask_p0_e1_figure(timeout=1h)>](https://anyparser-realtime-test-j-assetsconstructfilebucke-2wg0ln280yvz.s3.amazonaws.com/result_file_advanced/async_S4iyw7RAEE8CTGkVgHYeI8nsTmSALI1U2HXvAN6j/2024/10/29/test_1figure_1table_ca33eeda-23ac-4a1b-a6e3-17317cc54481.png/%3C%40mask_p0_e1_figure_s3%3E.png?AWSAccessKeyId=ASIAXM24X76XPGTHDL75&Signature=1F2FBNLAat4%2FS9v2sCgufTHbnhA%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEN7%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJHMEUCICyyNIpApgz38DKh44HxjxiimKaZtsThrK4LHWp2pvFNAiEAqCNitanW6d9lUNPAFqHPcNsPu9U8fJb2b3tPY5Doagoq0wMIVxAAGgw1MDg2MTEyNjQ0MzAiDG5B0BM7FjSAapU5ziqwA4IRQu%2FLS9Z8jmgEdcO70c5kHJCZ0f5dx3q0pY8Ahxcqnu9KB0omgTjIf7kaGDVI%2FwccE%2B97joOhWo2zERnRM5gWIJAnWvYg5iXFBs3YXbF6%2BaSl8sBQgxRxSIhJZltSk2S2zaKlkLcrfMy17%2BX0rqp1YqZDMBk4dPoaZpTNh1oqwSDEhcKPW7F04ZRUTZcsAYQHE%2BAu7doeaGXCeSI3oViHYOU%2BOdpAjBgWCpr4beZYlwVdUvYU%2F6BqNcauxidyBFMPRQV9ZloBdhkcmtvHqFRr%2Fl3z9s7t4JdXXnIRSxssKlBrlsL8J%2FCCfCxzySA24%2Fp%2BNkiefHcMRA12rORqaV%2BawZXGHeTIjpeLAuARcyDBoe2C7g36Qn%2FRNRAJeM3lJUoJpVhJt6PvaZzL8%2B%2Bu0hV%2FGOwDwh1hL5Ltpr6DE7FEVCem%2Bmmg066QG01fM1XKYCSaCINbRspifY4jDgZvLcFpcFoXVhdKisEehpys6ESUocjzbip9SqdQAaIbRifyCueDauHd5vC%2FgO0kZgTYfNVCqi%2FvQI0iOtpdWEaK%2F3h9XrRkvfKkkEuGIhyfW4NljjCw64G5BjqeAQcgJHPslWjoHbul7HHc6ULF0OkI38W4u%2F4GxmNepvntBsSy3zcSn43vGzbxzczK37VgDvK%2BTNy4t3tyQfUZYI%2Fw308Y1CVFRVyt7wTDe5jHzDUVLtx8jO0eP4kj8CvjWTAHPRSbLiSMmKF0noEbPt16i6pBjcNQltSqApB%2FbySYQxndf09KyPLD8xS%2Fd3k5TM5B5InfGE5KyVnSZEor&Expires=1730184369)\n",
"\n",
"Figure 5: Effect of killing servers on throughput.\n"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(Markdown(markdown_output))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "any",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
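Outside a notebook, the same flow condenses into a plain script; a sketch under the same assumptions (the IPython display is replaced by writing the returned markdown to disk, and async_fetch blocks while printing "Waiting for response..." as the cell output above shows):

from any_parser import AnyParser, ModelType

ap = AnyParser(api_key="...")  # replace with a real API key

file_id = ap.async_extract("./sample_data/test_1figure_1table.png", ModelType.ADVANCED, {})
markdown_output = ap.async_fetch(file_id=file_id)  # polls until the advanced parse completes

# Persist the extracted markdown (tables, prose, and presigned figure links)
with open("test_1figure_1table.md", "w", encoding="utf-8") as f:
    f.write(markdown_output)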
Binary file added examples/sample_data/test_1figure_1table.png
