diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py
index 5fe4a15..bf06a45 100644
--- a/any_parser/any_parser.py
+++ b/any_parser/any_parser.py
@@ -283,8 +283,6 @@ def async_extract(
             process_type = ProcessType.FILE
         elif model == ModelType.PRO:
             process_type = ProcessType.FILE_REFINED_QUICK
-        elif model == ModelType.PARSE_WITH_LAYOUT:
-            process_type = ProcessType.PARSE_WITH_LAYOUT
         else:
             return "Error: Invalid model type", None
 
@@ -308,6 +306,44 @@ def async_extract(
         # If response successful, upload the file
         return upload_file_to_presigned_url(file_path, response)
 
+    def async_parse_with_layout(self, file_path: str) -> str:
+        """Upload a file for asynchronous parsing with layout analysis.
+
+        Compared with `async_extract`, the file layout is analyzed first; text, tables,
+        and images are then processed separately and combined into markdown format.
+
+        Args:
+            file_path (str): The path to the file to be parsed.
+        Returns:
+            str: The file id of the uploaded file. NOTE(review): a failed path or
+                file-type check returns the tuple `(error, None)`, not a str.
+        """
+        file_extension = Path(file_path).suffix.lower().lstrip(".")
+
+        # Check file path and type; a truthy result is treated as an error
+        error = check_file_type_and_path(file_path, file_extension)
+
+        if error:
+            return error, None
+
+        file_name = Path(file_path).name
+        # Create the JSON payload
+        payload = {
+            "file_name": file_name,
+            "process_type": "parse_with_layout",
+        }
+
+        # Send the POST request
+        response = requests.post(
+            self._async_upload_url,
+            headers=self._headers,
+            data=json.dumps(payload),
+            timeout=TIMEOUT,
+        )
+
+        # If response successful, upload the file
+        return upload_file_to_presigned_url(file_path, response)
+
     def async_extract_key_value(
         self,
         file_path: str,
diff --git a/any_parser/utils.py b/any_parser/utils.py
index 546f068..ed70fe1 100644
--- a/any_parser/utils.py
+++ b/any_parser/utils.py
@@ -8,7 +8,6 @@
 class ModelType(Enum):
     BASE = "base"
     PRO = "pro"
-    PARSE_WITH_LAYOUT = "parse_with_layout"
 
 
 SUPPORTED_FILE_EXTENSIONS = [
@@ -49,7 +48,7 @@ def upload_file_to_presigned_url(
 
 
 def check_model(model: ModelType) -> None:
-    if model not in {ModelType.BASE, ModelType.PRO, ModelType.PARSE_WITH_LAYOUT}:
+    if model not in {ModelType.BASE, ModelType.PRO}:
         valid_models = ", ".join(["`" + model.value + "`" for model in ModelType])
         return f"Invalid model type: {model}. Supported `model` types include {valid_models}."
 
diff --git a/examples/async_parse_with_layout.ipynb b/examples/async_parse_with_layout.ipynb index f2c139d..ac47fe7 100644 --- a/examples/async_parse_with_layout.ipynb +++ b/examples/async_parse_with_layout.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -32,27 +32,23 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "file_path = \"./sample_data/test_1figure_1table.png\"\n", - "file_id = ap.async_extract(file_path, ModelType.PARSE_WITH_LAYOUT, {})" + "file_id = ap.async_parse_with_layout(file_path)" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Waiting for response...\n", - "Waiting for response...\n", - "Waiting for response...\n", - "Waiting for response...\n", "Waiting for response...\n", "Waiting for response...\n", "Waiting for response...\n" @@ -65,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -97,17 +93,17 @@ "\n", "5.2 Availability\n", "\n", - "Figure 5 illustrates the availability benefits of running Spanner in multiple datacenters. It shows the results of three experiments on throughput in the presence of datacenter failure, all of which are overlaid onto the same time scale. The test universe consisted of 5 zones Z_i, each of which had 25 spanservers. The test database was sharded into 1250 Paxos groups, and 100 test clients constantly issued non-snapshot reads at an aggregate rate of 50K reads/second. All of the leaders were explicitly placed in Z_1. 
Five seconds into each test, all of the servers in one zone were killed: non-leader kills Z_2; leader-hard kills Z_1; leader-soft kills Z_1, but it gives notifications to all of the servers that they should handoff leadership first.\n", + "Figure 5 illustrates the availability benefits of running Spanner in multiple datacenters. It shows the results of three experiments on throughput in the presence of datacenter failure, all of which are overlaid onto the same time scale. The test universe consisted of 5 zones Zi, each of which had 25 spanservers. The test database was sharded into 1250 Paxos groups, and 100 test clients constantly issued non-snapshot reads at an aggregate rate of 50K reads/second. All of the leaders were explicitly placed in Z1. Five seconds into each test, all of the servers in one zone were killed: non-leader kills Z2; leader-hard kills Z1; leader-soft kills Z1, but it gives notifications to all of the servers that they should handoff leadership first.\n", "\n", - "Killing Z_2 has no effect on read throughput. Killing Z_1 while giving the leaders time to handoff leadership to a different zone has a minor effect: the throughput drop is not visible in the graph, but is around 3-4%. On the other hand, killing Z_1 with no warning has a severe effect: the rate of completion drops almost to 0. As leaders get re-elected, though, the throughput of the system rises to approximately 100K reads/second because of two artifacts of our experiment: there is extra capacity in the system, and operations are queued while the leader is unavailable. As a result, the throughput of the system rises before leveling off again at its steady-state rate.\n", + "Killing Z2 has no effect on read throughput. Killing Z1 while giving the leaders time to handoff leadership to a different zone has a minor effect: the throughput drop is not visible in the graph, but is around 3-4%. 
On the other hand, killing Z1 with no warning has a severe effect: the rate of completion drops almost to 0. As leaders get re-elected, though, the throughput of the system rises to approximately 100K reads/second because of two artifacts of our experiment: there is extra capacity in the system, and operations are queued while the leader is unavailable. As a result, the throughput of the system rises before leveling off again at its steady-state rate.\n", "\n", "We can also see the effect of the fact that Paxos leader leases are set to 10 seconds. When we kill the zone, the leader-lease expiration times for the groups should be evenly distributed over the next 10 seconds. Soon after each lease from a dead leader expires, a new leader is elected. Approximately 10 seconds after the kill time, all of the groups have leaders and throughput has recovered. Shorter lease times would reduce the effect of server deaths on availability, but would require greater amounts of lease-renewal network traffic. We are in the process of designing and implementing a mechanism that will cause slaves to release Paxos leader leases upon leader failure.\n", "\n", "5.3 TrueTime\n", "\n", - "Two questions must be answered with respect to TrueTime: is ε truly a bound on clock uncertainty, and how bad does ε get? For the former, the most serious problem would be if a local clock’s drift were greater than 200us/sec: that would break assumptions made by TrueTime. Our machine statistics show that bad CPUs are 6 times more likely than bad clocks. That is, clock issues are extremely infrequent, relative to much more serious hardware problems. As a result, we believe that TrueTime’s implementation is as trustworthy as any other piece of software upon which Spanner depends.\n", + "Two questions must be answered with respect to TrueTime: is ε truly a bound on clock uncertainty, and how bad does ε get? 
For the former, the most serious problem would be if a local clock’s drift were greater than 200usec/sec: that would break assumptions made by TrueTime. Our machine statistics show that bad CPUs are 6 times more likely than bad clocks. That is, clock issues are extremely infrequent, relative to much more serious hardware problems. As a result, we believe that TrueTime’s implementation is as trustworthy as any other piece of software upon which Spanner depends.\n", "\n", - "![<@mask_p0_e1_figure(timeout=1h)>](https://anyparser-realtime-test-j-assetsconstructfilebucke-2wg0ln280yvz.s3.amazonaws.com/result_parse_with_layout/async_S4iyw7RAEE8CTGkVgHYeI8nsTmSALI1U2HXvAN6j/2024/11/1/test_1figure_1table_f685e88d-d27f-4f1b-9f6c-f03a1fc9ae83.png/%3C%40mask_p0_e1_figure_s3%3E.png?AWSAccessKeyId=ASIAXM24X76XOK7ITU5J&Signature=CLJE%2BOeN6U%2F49Jkd3xm%2FZvguPTM%3D&x-amz-security-token=IQoJb3JpZ2luX2VjECQaCXVzLXdlc3QtMiJHMEUCIQD4c8%2BB73pmEE8VT5NHxpyJlvnko7TgUhRp17lxlf0n1AIgKUhyP7tkU7TiMOraliELOnkaiGJmpFnx8DNKXF8Cq84q3AMInf%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAAGgw1MDg2MTEyNjQ0MzAiDKBsIf2UfBx%2BZLkPHCqwA01i7fZyNVUUsZTGQo8kILSfH1ueDrR3JwOdux0bJzq%2BL2g5lx9LpsEz5BL%2BZg7RwkqQwK6AHHNunLmiBU96QfW%2FwvBItqPg%2FzIoAHbmS4WLksKm6c1zXG34etNqrXWgoJ%2FUt3qshGP%2F5TcQYXuIXk%2FL%2Fh8%2Bd2rDTtRDovpWewsPp7hLydxqgNfDihtsY0UzoPKlYK5zzbBShhoqrG3y5AGBF1F9Q%2FFByeW7PiH4OZEwMeQ%2FNZBTNKJ%2BI92iEWbRT0av4t10zL5jlS%2FnH20RBCQsFE%2F5J7Vf4oJyQL8tCMlUHScHqsMvkH%2BEZ4LIxF2cgHwXbzCKWhmI6H44nNC9DM2Ivhy40ETbvPYi3y%2BRgUXxabdnLmmCjz1ls%2Bbqnw1TDx9JjD693KSOSuW7qOikIduS8j4YEdinzKxr6a01JBOeHwb3zUFVwprhqOR2yGy%2FaPjYZN8nuUaH5muRt0KCZTudRvRYobCaxCrXi1I6cicmEPxreaDS43EpIiqfI9n1bhZPNE%2FYqzDvOXZjFM3%2Bcqa1Wwhyiywhv0I0xE%2Ftl%2B5jQe1hH4invJA2H%2FUCZ2vDbDCVlpG5BjqeAQX7PKBMX2QivsKT2kTvqP1F2ByedRRh0tVPBXVyVKudp4skyHUUq8GvCxSLmlH4fwS5KPXFDC6ehQM2RXuHrdkgDhQzGsf15ZwMN%2Bq9aKkqqzXE6U0Ekp8B3Zg4xx8PmlftrHhMpGQCPMz8SPQaT2n9%2B1Aredixh4gT0%2BAQwvcprapl5AAtYLFyMDyj8T8UKTwn2eJ8%2FiR5r1STBOQA&Expires=1730435409)\n", + 
"![<@mask_p0_e1_figure(timeout=1h)>](https://anyparser-realtime-test-j-assetsconstructfilebucke-2wg0ln280yvz.s3.amazonaws.com/result_parse_with_layout/async_S4iyw7RAEE8CTGkVgHYeI8nsTmSALI1U2HXvAN6j/2024/11/5/test_1figure_1table_9289ba0e-72c5-48bb-be48-1641b94f9a5c.png/%3C%40mask_p0_e1_figure_s3%3E.png?AWSAccessKeyId=ASIAXM24X76XPWKTWXDM&Signature=mZ0m7RjJeSzx0s9CwX4lYiQ6B08%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEIb%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJHMEUCIGD1BvlDiDwHhnQuFgm1FwUpT7dSQvGHeLDwflymFYKvAiEAgRH9Fh7hF%2F%2BwSs0vvYSN1jAn9bsa3BkxfeLyFRvLJ5Mq3AMI%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARAAGgw1MDg2MTEyNjQ0MzAiDOOkXT2GLdegRIzCISqwAzb7qh6iEu%2FJIwki%2FtLa%2FHGgbKnZ%2F%2FKbsoxnnRWHDWoliSog3oibTehZaRFdOhrQto313dWESjMqGZnC%2B5B91WxR6CWR0yNap7OpI7SVSDi%2BWZG7l2smV8NBMrFyxNIIgr7vhYa44xpfzGbVYYboMZu1u8bHeQxitWKEJlvqvGY26Y7hN23lMFf32wjLpm4pThsf%2FP2TQAQLYgF%2FCTwVoBIUZs4jOl9gkeYi60WW3OhdSCgEUKuBKqodzAMOe6dcKvt7tJz5xOau3JvA3Stq%2FzyjJxuOgn0tErdaqU1AQ1N%2FIbdOB38NVH06PgLWTq4nYAzpRDs7f8q9z3yvh8%2BRkjAFzk5pygylkCT5qsWtTvsthqhP8JialK6bRh2SfMKBV69%2BwEKgaiHZZ9s7VExsBzokoH6qWrynHeaIcViQEA2mxp1ZugfCbBEsZe%2BNNUNRc%2BWurS3cjHxZKs7wB0Zutd6shsw68ph%2Fal5ewzPeKElXfppJ7DRZL6AJzOrQ6t3cRpCvUf%2BGI9Mwkr6JAgZG%2Bgk3F4lP3XmAYVYoatzK13y%2BGOShdhfAtUXZ0cxqikb9MjCC46a5BjqeAQ%2Fv%2Fe4ogTZbLdo1UXs8Dfl%2BpoFmNL7mvRXe6IQigEoocXeXd1svTsYa181ABJxkGvOQ7Gq4wdp%2Bw2pEbOOsh0IDk%2BNCKaffpOU%2BgPVAEN61yNEMtM28f%2FjFRWpC8qQfDjiz1VM9xiqK39iJjJsYcNCB1b6x2VoxKs4rgOUtkZi27eHw%2B7fJvEnrM3aAelAW46dCI%2FYjhRno3ys4p9qm&Expires=1730789387)\n", "\n", "Figure 5: Effect of killing servers on throughput.\n" ], diff --git a/examples/sample_data/resume_1.pdf b/examples/sample_data/resume_1.pdf new file mode 100644 index 0000000..b914f8e Binary files /dev/null and b/examples/sample_data/resume_1.pdf differ diff --git a/examples/sample_data/resume_1.png b/examples/sample_data/resume_1.png new file mode 100644 index 0000000..6a1e276 Binary files /dev/null and b/examples/sample_data/resume_1.png differ