From 4173615061759522c25275c34b74b995be10dce8 Mon Sep 17 00:00:00 2001
From: james-rms
Date: Tue, 30 Apr 2024 10:07:07 +1000
Subject: [PATCH 01/44] go: reduce memory footprint of writer (#1164)

### Changelog

- Changed: reduced memory footprint of the writer by removing an unnecessary chunk buffer.

### Description

We are unnecessarily copying the entire compressed chunk into an internal buffer before writing it out - this increases the memory footprint of an MCAP writer by the size of a compressed chunk. This usually isn't too important, but when you're running an application that writes 200 MCAP files at once, those 4MB buffers can add up.

This PR removes that chunk buffer and instead uses the `writer.msg` scratch buffer to marshal only the "header" of the chunk - that is, all fields of the chunk record besides the compressed data.

This PR also adds a benchmark to demonstrate the memory saving. This benchmark is similar to the existing WriterAllocs benchmark, but:

- we allocate 100 writers and write the nth message to the (n % 100)th writer
- instead of writing the output MCAP to a buffer, the 100 writers all write to io.Discard. This is because we don't care about the output result, and we also want to reduce the effect of the allocated output on the result as much as possible.

Benchmark results:

```
Before:
(python-sNIFi2pF) j@192-168-1-105 mcap % go test ./... -bench=BenchmarkManyWriterAllocs -benchtime=10s -benchmem
goos: darwin
goarch: arm64
pkg: github.com/foxglove/mcap/go/mcap
BenchmarkManyWriterAllocs/big_chunks_many_messages-8     22    510907131 ns/op   4038205 messages/sec    515778960 B/op     48796 allocs/op
BenchmarkManyWriterAllocs/small_chunks_many_messages-8   25    458119757 ns/op   4355813 messages/sec     11032190 B/op     79796 allocs/op
BenchmarkManyWriterAllocs/many_channels-8                 2   5889904000 ns/op    345263 messages/sec   4025878024 B/op  22587677 allocs/op
PASS

After:
(python-sNIFi2pF) j@192-168-1-105 mcap % go test ./... -bench=BenchmarkManyWriterAllocs -benchtime=10s -benchmem
goos: darwin
goarch: arm64
pkg: github.com/foxglove/mcap/go/mcap
BenchmarkManyWriterAllocs/big_chunks_many_messages-8     22    513380782 ns/op   3988771 messages/sec    341286355 B/op     48698 allocs/op
BenchmarkManyWriterAllocs/small_chunks_many_messages-8   25    464017923 ns/op   4417701 messages/sec      8853856 B/op     79700 allocs/op
BenchmarkManyWriterAllocs/many_channels-8                 2   5858519938 ns/op    347740 messages/sec   2881396708 B/op  22587001 allocs/op
PASS
ok   github.com/foxglove/mcap/go/mcap   43.179s
```

As expected, the write speed is roughly the same, but far fewer bytes are allocated in all benchmark cases.
| Before | After |
| --- | --- |
| MCAP writer copies the full chunk record into a buffer before writing that buffer out to the writer. | MCAP writer copies the chunk header into a buffer, writes it out, then writes out the compressed chunk data. |
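For illustration only, here is a minimal sketch of the two-write pattern the description refers to. It is not the patch's actual `flushActiveChunk` code: the helper name is hypothetical and it omits the chunk's other fields (timestamps, CRC, compression string), keeping only the opcode and length prefix.

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
)

// writeRecordHeaderThenPayload sketches the pattern this PR adopts: marshal only the
// small leading fields of a record into a reusable scratch buffer, write that, then
// write the (potentially large) compressed payload directly from its own buffer,
// instead of copying header + payload into one combined buffer first.
func writeRecordHeaderThenPayload(w io.Writer, scratch []byte, opcode byte, payload []byte) ([]byte, error) {
	const headerLen = 1 + 8 // opcode + little-endian payload length
	if cap(scratch) < headerLen {
		scratch = make([]byte, headerLen)
	}
	scratch = scratch[:headerLen]
	scratch[0] = opcode
	binary.LittleEndian.PutUint64(scratch[1:], uint64(len(payload)))
	if _, err := w.Write(scratch); err != nil {
		return scratch, err
	}
	// Second write: the payload is never copied into the scratch buffer.
	_, err := w.Write(payload)
	return scratch, err
}

func main() {
	var out bytes.Buffer
	scratch := make([]byte, 0, 16)
	payload := bytes.Repeat([]byte{0xAB}, 4*1024*1024) // stand-in for a compressed chunk
	scratch, err := writeRecordHeaderThenPayload(&out, scratch, 0x06, payload)
	if err != nil {
		panic(err)
	}
	fmt.Printf("wrote %d bytes using a %d-byte scratch buffer\n", out.Len(), cap(scratch))
}
```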
--- go/mcap/version.go | 2 +- go/mcap/writer.go | 34 ++++++++-------- go/mcap/writer_test.go | 92 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+), 17 deletions(-) diff --git a/go/mcap/version.go b/go/mcap/version.go index 04a419096e..ceb3bc1aea 100644 --- a/go/mcap/version.go +++ b/go/mcap/version.go @@ -1,4 +1,4 @@ package mcap // Version of the MCAP library. -var Version = "v1.2.0" +var Version = "v1.3.1" diff --git a/go/mcap/writer.go b/go/mcap/writer.go index 681d73630d..6a0e71d5a4 100644 --- a/go/mcap/writer.go +++ b/go/mcap/writer.go @@ -38,7 +38,6 @@ type Writer struct { w *writeSizer buf []byte msg []byte - chunk []byte uncompressed *bytes.Buffer compressed *bytes.Buffer compressedWriter *countingCRCWriter @@ -434,7 +433,9 @@ func (w *Writer) flushActiveChunk() error { crc := w.compressedWriter.CRC() compressedlen := w.compressed.Len() uncompressedlen := w.compressedWriter.Size() - msglen := 8 + 8 + 8 + 4 + 4 + len(w.opts.Compression) + 8 + compressedlen + // the "top fields" are all fields of the chunk record except for the compressed records. + topFieldsLen := 8 + 8 + 8 + 4 + 4 + len(w.opts.Compression) + 8 + msglen := topFieldsLen + compressedlen chunkStartOffset := w.w.Size() var start, end uint64 if w.currentChunkMessageCount != 0 { @@ -445,24 +446,25 @@ func (w *Writer) flushActiveChunk() error { // when writing a chunk, we don't go through writerecord to avoid needing to // materialize the compressed data again. Instead, write the leading bytes // then copy from the compressed data buffer. - recordlen := 1 + 8 + msglen - if len(w.chunk) < recordlen { - w.chunk = make([]byte, recordlen*2) - } - offset, err := putByte(w.chunk, byte(OpChunk)) + recordHeaderLen := 1 + 8 + topFieldsLen + w.ensureSized(recordHeaderLen) + offset, err := putByte(w.msg, byte(OpChunk)) if err != nil { return err } - offset += putUint64(w.chunk[offset:], uint64(msglen)) - offset += putUint64(w.chunk[offset:], start) - offset += putUint64(w.chunk[offset:], end) - offset += putUint64(w.chunk[offset:], uint64(uncompressedlen)) - offset += putUint32(w.chunk[offset:], crc) - offset += putPrefixedString(w.chunk[offset:], string(w.opts.Compression)) - offset += putUint64(w.chunk[offset:], uint64(w.compressed.Len())) - offset += copy(w.chunk[offset:recordlen], w.compressed.Bytes()) - _, err = w.w.Write(w.chunk[:offset]) + offset += putUint64(w.msg[offset:], uint64(msglen)) + offset += putUint64(w.msg[offset:], start) + offset += putUint64(w.msg[offset:], end) + offset += putUint64(w.msg[offset:], uint64(uncompressedlen)) + offset += putUint32(w.msg[offset:], crc) + offset += putPrefixedString(w.msg[offset:], string(w.opts.Compression)) + offset += putUint64(w.msg[offset:], uint64(w.compressed.Len())) + _, err = w.w.Write(w.msg[:offset]) + if err != nil { + return err + } + _, err = w.w.Write(w.compressed.Bytes()) if err != nil { return err } diff --git a/go/mcap/writer_test.go b/go/mcap/writer_test.go index 59e5e9303b..f562271159 100644 --- a/go/mcap/writer_test.go +++ b/go/mcap/writer_test.go @@ -741,3 +741,95 @@ func TestBYOCompressor(t *testing.T) { assertReadable(t, bytes.NewReader(buf.Bytes())) assert.Positive(t, blockCount) } + +func BenchmarkManyWriterAllocs(b *testing.B) { + cases := []struct { + assertion string + chunkSize int + messageCount int + channelCount int + }{ + { + "big chunks many messages", + 8 * 1024 * 1024, + 2e6, + 100, + }, + { + "small chunks many messages", + 8 * 1024, + 2e6, + 100, + }, + { + "many channels", + 4 * 1024 * 1024, + 2e6, + 55000, + }, + } 
+ + stringData := "hello, world!" + messageData := []byte("hello, world") + schema := Schema{ + Name: stringData, + Encoding: "ros1msg", + Data: messageData, + } + channel := Channel{ + Topic: stringData, + MessageEncoding: "msg", + Metadata: map[string]string{ + "": "", + }, + } + message := Message{ + Sequence: 0, + Data: messageData, + } + writers := make([]*Writer, 100) + for _, c := range cases { + b.ResetTimer() + b.Run(c.assertion, func(b *testing.B) { + for n := 0; n < b.N; n++ { + t0 := time.Now() + for i := 0; i < len(writers); i++ { + writer, err := NewWriter(io.Discard, &WriterOptions{ + ChunkSize: int64(c.chunkSize), + Chunked: true, + }) + require.NoError(b, err) + require.NoError(b, writer.WriteHeader(&Header{ + Profile: "ros1", + Library: "foo", + })) + for j := 0; j < c.channelCount; j++ { + schema.ID = uint16(j + 1) + require.NoError(b, writer.WriteSchema(&schema)) + channel.SchemaID = uint16(j + 1) + channel.ID = uint16(j) + require.NoError(b, writer.WriteChannel(&channel)) + } + writers[i] = writer + } + channelID := 0 + messageCount := 0 + for messageCount < c.messageCount { + writerIdx := messageCount % len(writers) + message.ChannelID = uint16(channelID) + message.LogTime = uint64(messageCount) + message.PublishTime = uint64(messageCount) + require.NoError(b, writers[writerIdx].WriteMessage(&message)) + messageCount++ + channelID++ + channelID %= c.channelCount + } + for _, writer := range writers { + require.NoError(b, writer.Close()) + } + elapsed := time.Since(t0) + b.ReportMetric(float64(c.messageCount)/elapsed.Seconds(), "messages/sec") + } + }) + } +} From 73c7183ed5b3f9cecad52aa43a3c852d17b15d16 Mon Sep 17 00:00:00 2001 From: Wyatt Alt Date: Wed, 1 May 2024 19:34:36 -0700 Subject: [PATCH 02/44] Add sample bibtex citation to README (#1166) Adds a suggested bibtex citation to the README. --- README.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/README.md b/README.md index ba8760c7a6..b8b229a9f3 100644 --- a/README.md +++ b/README.md @@ -93,3 +93,25 @@ Tag a release matching the version number `releases/swift/vX.Y.Z` 1. Update the version in rust/Cargo.toml 2. Tag a release matching the version number `releases/rust/vX.Y.Z` + +## Citations + +If you use MCAP in your research, please cite it in your work. Our suggested +citation format is: + +``` +@software{MCAP, + title = {MCAP: serialization-agnostic log container file format}, + author = {{Foxglove Developers}}, + url = {https://mcap.dev}, + version = {your version}, + date = {your date of access}, + year = {2024}, + publisher = {{Foxglove Technologies}}, + note = {Available from https://github.com/foxglove/mcap} +} +``` + +Please replace the version and date fields with the version of the software you +used, and the date you obtained it. Citing MCAP will help spread awareness of +the project and strengthen the ecosystem. From 4c1ac3dbe3875083f24f83d7387dd9f75a073a8f Mon Sep 17 00:00:00 2001 From: Jeff Zellner Date: Wed, 8 May 2024 11:46:41 -0600 Subject: [PATCH 03/44] simplify release binary download instructions, link to latest release page (#1169) ### Changelog Remove cli download script, and link to the github latest release page ### Docs None ### Description The helper script would have trouble matching architecture and release binary depending on OS/platform. This should simplify the process, and not require tracking the version. 
--- .github/workflows/website.yml | 5 - website/docs/guides/cli.md | 188 ++++++++++++++++ website/docs/guides/cli.mdx | 216 ------------------- website/docusaurus.config.js | 28 --- website/src/components/CLIDownloadScript.tsx | 15 -- 5 files changed, 188 insertions(+), 264 deletions(-) create mode 100644 website/docs/guides/cli.md delete mode 100644 website/docs/guides/cli.mdx delete mode 100644 website/src/components/CLIDownloadScript.tsx diff --git a/.github/workflows/website.yml b/.github/workflows/website.yml index e732b0c377..b8852a1e1e 100644 --- a/.github/workflows/website.yml +++ b/.github/workflows/website.yml @@ -3,8 +3,6 @@ name: Website on: push: branches: [main] - # deploy the website for every new MCAP CLI release, so the version gets populated in CLI installation docs - tags: ["releases/mcap-cli/*"] pull_request: branches: ["*"] @@ -15,9 +13,6 @@ jobs: - uses: actions/checkout@v3 with: lfs: true - # https://github.com/actions/checkout/issues/701 - causes tags to be fetched, which - # are needed to determine the latest MCAP CLI release - fetch-depth: 0 - run: corepack enable - uses: actions/setup-node@v4 diff --git a/website/docs/guides/cli.md b/website/docs/guides/cli.md new file mode 100644 index 0000000000..feac70e4d2 --- /dev/null +++ b/website/docs/guides/cli.md @@ -0,0 +1,188 @@ +--- +sidebar_position: 3 +--- + +# CLI + +The MCAP command line tool is useful for working with MCAP files. + +## Installation + +### Release binaries + +Download binaries for your platform from [the latest Github release](https://github.com/foxglove/mcap/releases/latest). + +Then, mark it executable: + + $ chmod +x mcap + +If required, move the binary onto your path. + +### Homebrew + +To install using [Homebrew](https://brew.sh) on macOS or Linux, run: + + $ brew install mcap + +### From Source + +:::caution +Installing via `go install` is not supported. To build from source you must clone the repository. +::: + +1. Clone the [mcap repository](https://github.com/foxglove/mcap). +2. `$ cd go/cli/mcap` +3. `$ make build` +4. The binary will be built into the a newly created `bin` folder. + +## Usage + +Run `mcap --help` for detailed usage information. + + $ mcap --help + + Usage: + mcap [command] + + Available Commands: + add Add records to an existing MCAP file + cat Cat the messages in an MCAP file to stdout + completion Generate the autocompletion script for the specified shell + compress Create a compressed copy of an MCAP file + convert Convert a bag file to an MCAP file + decompress Create an uncompressed copy of an MCAP file + doctor Check an MCAP file structure + filter Copy some filtered MCAP data to a new file + get Get a record from an MCAP file + help Help about any command + info Report statistics about an MCAP file + list List records of an MCAP file + merge Merge a selection of MCAP files by record timestamp + recover Recover data from a potentially corrupt MCAP file + version Output version information + + Flags: + --config string Config file (default is $HOME/.mcap.yaml) + -h, --help help for mcap + -v, --verbose Verbose output + + Use "mcap [command] --help" for more information about a command. + +### ROS Bag to MCAP conversion + +Convert a ROS 1 bag file to mcap: + + + + $ mcap convert demo.bag demo.mcap + + + +Convert a ROS 2 db3 file to mcap: + + + + $ mcap convert demo.db3 demo.mcap + + + +In ROS 2 releases prior to Iron, db3 files did not contain message definitions (schemas). 
When converting to MCAP, you should first source the same ROS 2 workspace that the original file was recorded with. If this is not available, you will need to specify a search directory for message definitions (e.g `/opt/ros/humble` from the original system): + + $ mcap convert demo.db3 demo.mcap --ament-prefix-path /path/to/humble + +Alternatively, the [`ros2 bag convert`](https://github.com/ros2/rosbag2#converting-bags) utility may be used to convert between db3 and mcap. + +### File summarization + +Report summary statistics on an MCAP file: + + + + $ mcap info demo.mcap + library: mcap go #(devel) + profile: ros1 + messages: 1606 + duration: 7.780758504s + start: 2017-03-21T19:26:20.103843113-07:00 (1490149580.103843113) + end: 2017-03-21T19:26:27.884601617-07:00 (1490149587.884601617) + compression: + zstd: [14/14 chunks] (50.73%) + channels: + (0) /diagnostics 52 msgs (6.68 Hz) : diagnostic_msgs/DiagnosticArray [ros1msg] + (1) /image_color/compressed 234 msgs (30.07 Hz) : sensor_msgs/CompressedImage [ros1msg] + (2) /tf 774 msgs (99.48 Hz) : tf2_msgs/TFMessage [ros1msg] + (3) /radar/points 156 msgs (20.05 Hz) : sensor_msgs/PointCloud2 [ros1msg] + (4) /radar/range 156 msgs (20.05 Hz) : sensor_msgs/Range [ros1msg] + (5) /radar/tracks 156 msgs (20.05 Hz) : radar_driver/RadarTracks [ros1msg] + (6) /velodyne_points 78 msgs (10.02 Hz) : sensor_msgs/PointCloud2 [ros1msg] + attachments: 0 + + + +### Indexed reading + +Echo messages for a specific topic to stdout as JSON: + + $ mcap cat demo.mcap --topics /tf --json | head -n 10 + {"topic":"/tf","sequence":2,"log_time":1490149580.103843113,"publish_time":1490149580.103843113,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.117017840,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} + {"topic":"/tf","sequence":3,"log_time":1490149580.113944947,"publish_time":1490149580.113944947,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.127078895,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} + {"topic":"/tf","sequence":8,"log_time":1490149580.124028613,"publish_time":1490149580.124028613,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.137141823,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} + {"topic":"/tf","sequence":10,"log_time":1490149580.134219155,"publish_time":1490149580.134219155,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.147199242,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} + {"topic":"/tf","sequence":11,"log_time":1490149580.144292780,"publish_time":1490149580.144292780,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.157286100,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} + {"topic":"/tf","sequence":12,"log_time":1490149580.154895238,"publish_time":1490149580.154895238,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.167376974,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} + 
{"topic":"/tf","sequence":15,"log_time":1490149580.165152280,"publish_time":1490149580.165152280,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.177463023,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} + {"topic":"/tf","sequence":20,"log_time":1490149580.175192697,"publish_time":1490149580.175192697,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.187523449,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} + {"topic":"/tf","sequence":21,"log_time":1490149580.185428613,"publish_time":1490149580.185428613,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.197612248,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} + {"topic":"/tf","sequence":22,"log_time":1490149580.196638030,"publish_time":1490149580.196638030,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.207699065,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} + +### Remote file support + +All commands except `convert` support reading from remote files stored in GCS: + + + + $ mcap info gs://your-remote-bucket/demo.mcap + library: mcap go #(devel) + profile: ros1 + messages: 1606 + duration: 7.780758504s + start: 2017-03-21T19:26:20.103843113-07:00 (1490149580.103843113) + end: 2017-03-21T19:26:27.884601617-07:00 (1490149587.884601617) + compression: + zstd: [14/14 chunks] (50.73%) + channels: + (0) /diagnostics 52 msgs (6.68 Hz) : diagnostic_msgs/DiagnosticArray [ros1msg] + (1) /image_color/compressed 234 msgs (30.07 Hz) : sensor_msgs/CompressedImage [ros1msg] + (2) /tf 774 msgs (99.48 Hz) : tf2_msgs/TFMessage [ros1msg] + (3) /radar/points 156 msgs (20.05 Hz) : sensor_msgs/PointCloud2 [ros1msg] + (4) /radar/range 156 msgs (20.05 Hz) : sensor_msgs/Range [ros1msg] + (5) /radar/tracks 156 msgs (20.05 Hz) : radar_driver/RadarTracks [ros1msg] + (6) /velodyne_points 78 msgs (10.02 Hz) : sensor_msgs/PointCloud2 [ros1msg] + attachments: 0 + + + +Remote reads will use the index at the end of the file to minimize latency and data transfer. 
+ +### File Diagnostics + +#### List chunks in a file + +The `mcap list` command can be used with chunks or attachments: + + $ mcap list chunks ~/data/mcap/demo.mcap + offset length start end compression compressed size uncompressed size compression ratio + 43 4529455 1490149580103843113 1490149580608392239 zstd 4529402 9400437 0.481829 + 4531299 4751426 1490149580618484655 1490149581212757989 zstd 4751373 9621973 0.493804 + 9284910 4726518 1490149581222848447 1490149581811286531 zstd 4726465 9617327 0.491453 + 14013453 4734289 1490149581821378989 1490149582418243031 zstd 4734236 9624850 0.491876 + 18749879 4742989 1490149582428402906 1490149583010292990 zstd 4742936 9646234 0.491688 + 23494877 4712785 1490149583020377156 1490149583617657323 zstd 4712732 9619341 0.489923 + 28209799 4662983 1490149583627720990 1490149584217852199 zstd 4662930 9533042 0.489133 + 32874919 4643191 1490149584227924615 1490149584813214116 zstd 4643138 9499481 0.488778 + 37520119 4726655 1490149584823300282 1490149585411567366 zstd 4726602 9591399 0.492796 + 42248895 4748884 1490149585421596866 1490149586021460449 zstd 4748831 9621776 0.493550 + 46999820 4746828 1490149586031607908 1490149586617282658 zstd 4746775 9632302 0.492798 + 51748769 4759213 1490149586627453408 1490149587217501700 zstd 4759160 9634744 0.493958 + 56510103 4750731 1490149587227624742 1490149587814043200 zstd 4750678 9622778 0.493691 + 61262859 217330 1490149587824113700 1490149587884601617 zstd 217277 217255 1.000101 diff --git a/website/docs/guides/cli.mdx b/website/docs/guides/cli.mdx deleted file mode 100644 index 974c1d0759..0000000000 --- a/website/docs/guides/cli.mdx +++ /dev/null @@ -1,216 +0,0 @@ ---- -sidebar_position: 3 ---- - -import CLIDownloadScript from "../../src/components/CLIDownloadScript"; - -# CLI - -The MCAP command line tool is useful for working with MCAP files. - -## Installation - -### Release binaries - -Use the asset links on https://github.com/foxglove/mcap/releases to download the latest binary for your platform: - - - -Then, mark it executable: - -```bash -$ chmod +x mcap -``` - -If required, move the binary onto your path. - -### Homebrew - -To install using [Homebrew](https://brew.sh) on macOS or Linux, run: - -```bash -$ brew install mcap -``` - -### From Source - -:::caution -Installing via `go install` is not supported. To build from source you must clone the repository. -::: - -1. Clone the [mcap repository](https://github.com/foxglove/mcap). -2. `$ cd go/cli/mcap` -3. `$ make build` -4. The binary will be built into the a newly created `bin` folder. - -## Usage - -Run `mcap --help` for detailed usage information. 
- -``` -$ mcap --help - -Usage: -mcap [command] - -Available Commands: -add Add records to an existing MCAP file -cat Cat the messages in an MCAP file to stdout -completion Generate the autocompletion script for the specified shell -compress Create a compressed copy of an MCAP file -convert Convert a bag file to an MCAP file -decompress Create an uncompressed copy of an MCAP file -doctor Check an MCAP file structure -filter Copy some filtered MCAP data to a new file -get Get a record from an MCAP file -help Help about any command -info Report statistics about an MCAP file -list List records of an MCAP file -merge Merge a selection of MCAP files by record timestamp -recover Recover data from a potentially corrupt MCAP file -version Output version information - -Flags: - --config string Config file (default is $HOME/.mcap.yaml) --h, --help help for mcap --v, --verbose Verbose output - -Use "mcap [command] --help" for more information about a command. -``` - -### ROS Bag to MCAP conversion - -Convert a ROS 1 bag file to mcap: - -{/* cspell: disable */} - -```bash -$ mcap convert demo.bag demo.mcap -``` - -{/* cspell: enable */} - -Convert a ROS 2 db3 file to mcap: - -{/* cspell: disable */} - -```bash -$ mcap convert demo.db3 demo.mcap -``` - -{/* cspell: enable */} - -In ROS 2 releases prior to Iron, db3 files did not contain message definitions (schemas). When converting to MCAP, you should first source the same ROS 2 workspace that the original file was recorded with. If this is not available, you will need to specify a search directory for message definitions (e.g `/opt/ros/humble` from the original system): - -```bash -$ mcap convert demo.db3 demo.mcap --ament-prefix-path /path/to/humble -``` - -Alternatively, the [`ros2 bag convert`](https://github.com/ros2/rosbag2#converting-bags) utility may be used to convert between db3 and mcap. 
- -### File summarization - -Report summary statistics on an MCAP file: - -{/* cspell: disable */} - -``` -$ mcap info demo.mcap -library: mcap go #(devel) -profile: ros1 -messages: 1606 -duration: 7.780758504s -start: 2017-03-21T19:26:20.103843113-07:00 (1490149580.103843113) -end: 2017-03-21T19:26:27.884601617-07:00 (1490149587.884601617) -compression: - zstd: [14/14 chunks] (50.73%) -channels: - (0) /diagnostics 52 msgs (6.68 Hz) : diagnostic_msgs/DiagnosticArray [ros1msg] - (1) /image_color/compressed 234 msgs (30.07 Hz) : sensor_msgs/CompressedImage [ros1msg] - (2) /tf 774 msgs (99.48 Hz) : tf2_msgs/TFMessage [ros1msg] - (3) /radar/points 156 msgs (20.05 Hz) : sensor_msgs/PointCloud2 [ros1msg] - (4) /radar/range 156 msgs (20.05 Hz) : sensor_msgs/Range [ros1msg] - (5) /radar/tracks 156 msgs (20.05 Hz) : radar_driver/RadarTracks [ros1msg] - (6) /velodyne_points 78 msgs (10.02 Hz) : sensor_msgs/PointCloud2 [ros1msg] -attachments: 0 -``` - -{/* cspell: enable */} - -### Indexed reading - -Echo messages for a specific topic to stdout as JSON: - -``` -$ mcap cat demo.mcap --topics /tf --json | head -n 10 -{"topic":"/tf","sequence":2,"log_time":1490149580.103843113,"publish_time":1490149580.103843113,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.117017840,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} -{"topic":"/tf","sequence":3,"log_time":1490149580.113944947,"publish_time":1490149580.113944947,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.127078895,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} -{"topic":"/tf","sequence":8,"log_time":1490149580.124028613,"publish_time":1490149580.124028613,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.137141823,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} -{"topic":"/tf","sequence":10,"log_time":1490149580.134219155,"publish_time":1490149580.134219155,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.147199242,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} -{"topic":"/tf","sequence":11,"log_time":1490149580.144292780,"publish_time":1490149580.144292780,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.157286100,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} -{"topic":"/tf","sequence":12,"log_time":1490149580.154895238,"publish_time":1490149580.154895238,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.167376974,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} -{"topic":"/tf","sequence":15,"log_time":1490149580.165152280,"publish_time":1490149580.165152280,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.177463023,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} 
-{"topic":"/tf","sequence":20,"log_time":1490149580.175192697,"publish_time":1490149580.175192697,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.187523449,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} -{"topic":"/tf","sequence":21,"log_time":1490149580.185428613,"publish_time":1490149580.185428613,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.197612248,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} -{"topic":"/tf","sequence":22,"log_time":1490149580.196638030,"publish_time":1490149580.196638030,"data":{"transforms":[{"header":{"seq":0,"stamp":1490149580.207699065,"frame_id":"base_link"},"child_frame_id":"radar","transform":{"translation":{"x":3.835,"y":0,"z":0},"rotation":{"x":0,"y":0,"z":0,"w":1}}}]}} -``` - -### Remote file support - -All commands except `convert` support reading from remote files stored in GCS: - -{/* cspell: disable */} - -``` -$ mcap info gs://your-remote-bucket/demo.mcap -library: mcap go #(devel) -profile: ros1 -messages: 1606 -duration: 7.780758504s -start: 2017-03-21T19:26:20.103843113-07:00 (1490149580.103843113) -end: 2017-03-21T19:26:27.884601617-07:00 (1490149587.884601617) -compression: - zstd: [14/14 chunks] (50.73%) -channels: - (0) /diagnostics 52 msgs (6.68 Hz) : diagnostic_msgs/DiagnosticArray [ros1msg] - (1) /image_color/compressed 234 msgs (30.07 Hz) : sensor_msgs/CompressedImage [ros1msg] - (2) /tf 774 msgs (99.48 Hz) : tf2_msgs/TFMessage [ros1msg] - (3) /radar/points 156 msgs (20.05 Hz) : sensor_msgs/PointCloud2 [ros1msg] - (4) /radar/range 156 msgs (20.05 Hz) : sensor_msgs/Range [ros1msg] - (5) /radar/tracks 156 msgs (20.05 Hz) : radar_driver/RadarTracks [ros1msg] - (6) /velodyne_points 78 msgs (10.02 Hz) : sensor_msgs/PointCloud2 [ros1msg] -attachments: 0 -``` - -{/* cspell: enable */} - -Remote reads will use the index at the end of the file to minimize latency and data transfer. 
- -### File Diagnostics - -#### List chunks in a file - -The `mcap list` command can be used with chunks or attachments: - -{/* cspell: disable */} - -``` -$ mcap list chunks ~/data/mcap/demo.mcap -offset length start end compression compressed size uncompressed size compression ratio -43 4529455 1490149580103843113 1490149580608392239 zstd 4529402 9400437 0.481829 -4531299 4751426 1490149580618484655 1490149581212757989 zstd 4751373 9621973 0.493804 -9284910 4726518 1490149581222848447 1490149581811286531 zstd 4726465 9617327 0.491453 -14013453 4734289 1490149581821378989 1490149582418243031 zstd 4734236 9624850 0.491876 -18749879 4742989 1490149582428402906 1490149583010292990 zstd 4742936 9646234 0.491688 -23494877 4712785 1490149583020377156 1490149583617657323 zstd 4712732 9619341 0.489923 -28209799 4662983 1490149583627720990 1490149584217852199 zstd 4662930 9533042 0.489133 -32874919 4643191 1490149584227924615 1490149584813214116 zstd 4643138 9499481 0.488778 -37520119 4726655 1490149584823300282 1490149585411567366 zstd 4726602 9591399 0.492796 -42248895 4748884 1490149585421596866 1490149586021460449 zstd 4748831 9621776 0.493550 -46999820 4746828 1490149586031607908 1490149586617282658 zstd 4746775 9632302 0.492798 -51748769 4759213 1490149586627453408 1490149587217501700 zstd 4759160 9634744 0.493958 -56510103 4750731 1490149587227624742 1490149587814043200 zstd 4750678 9622778 0.493691 -61262859 217330 1490149587824113700 1490149587884601617 zstd 217277 217255 1.000101 -``` - -{/* cspell: enable */} diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index afd8b2ac37..ec210baf19 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -5,9 +5,7 @@ const path = require("path"); const darkCodeTheme = require("prism-react-renderer/themes/dracula"); const lightCodeTheme = require("prism-react-renderer/themes/github"); -const util = require("util"); const webpack = require("webpack"); -const execAsync = util.promisify(require("child_process").exec); const modifySvgoConfigInPlace = require("./modifySvgoConfigInPlace"); @@ -67,32 +65,6 @@ const config = { }; }, }), - () => { - // determines the current CLI download link to display by fetching the latest tag matching - // releases/mcap-cli/* at build time. 
- return { - name: "latestCLIReleaseTag", - async loadContent() { - /* cspell:disable */ - const result = await execAsync( - `git tag --sort=-creatordate --list "releases/mcap-cli/*"`, - ); - /* cspell:enable */ - if (result.stdout.length === 0) { - throw new Error( - `could not determine latest MCAP CLI tag ${JSON.stringify( - result, - )}`, - ); - } - const latest = result.stdout.split("\n")[0]; - return latest; - }, - async contentLoaded({ content, actions }) { - actions.setGlobalData({ tag: content }); - }, - }; - }, ], presets: [ diff --git a/website/src/components/CLIDownloadScript.tsx b/website/src/components/CLIDownloadScript.tsx deleted file mode 100644 index b0bda8daa6..0000000000 --- a/website/src/components/CLIDownloadScript.tsx +++ /dev/null @@ -1,15 +0,0 @@ -import { usePluginData } from "@docusaurus/useGlobalData"; -import CodeBlock from "@theme/CodeBlock"; -import React from "react"; - -export default function CLIDownloadScript(): JSX.Element { - const latestVersion = usePluginData("latestCLIReleaseTag") as { tag: string }; - const tag = encodeURIComponent(latestVersion.tag); - - return ( - - $ wget https://github.com/foxglove/mcap/releases/download/{tag} - /mcap-linux-$(arch) -O mcap - - ); -} From fbcc2d31f68c25e79206c132fb4eb9dc52bc8331 Mon Sep 17 00:00:00 2001 From: Wyatt Alt Date: Sun, 12 May 2024 13:33:09 -0700 Subject: [PATCH 04/44] spec: Clarify legal contents of chunk records (#1170) Prior to this commit, the wording of the spec implied that only channel, schema, and message records are allowed in chunks. This could lead tooling implementers to validate that assertion. This is inconsistent with our support for extensible record types. Users should be able to put custom records in chunks as well, and readers that parse chunks should ignore unrecognized records rather than error on them. --- website/docs/spec/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/spec/index.md b/website/docs/spec/index.md index a2b031ba80..7d49891cc3 100644 --- a/website/docs/spec/index.md +++ b/website/docs/spec/index.md @@ -181,7 +181,7 @@ The message encoding and schema must match that of the Channel record correspond ### Chunk (op=0x06) -A Chunk contains a batch of Schema, Channel, and Message records. The batch of records contained in a chunk may be compressed or uncompressed. +A Chunk contains a batch of records. Readers should expect Schema, Channel, and Message records to be present in chunks, but future spec changes or user extensions may include others. The batch of records contained in a chunk may be compressed or uncompressed. All messages in the chunk must reference channels recorded earlier in the file (in a previous chunk, earlier in the current chunk, or earlier in the data section). From 24737468344872cff47f1b5e2aa79cb31c09435a Mon Sep 17 00:00:00 2001 From: james-rms Date: Wed, 15 May 2024 09:18:25 +1000 Subject: [PATCH 05/44] go: reader: make ordered iteration faster (#1168) ### Changelog - Added `Message.PopulateFrom([]byte, bool) error` method to Message, which allows readers to avoid reallocating Message structs between messages. - Added `MessageIterator.NextInto(*Message) (*Schema, *Channel, *Message, error)` method to the MessageIterator interface. This method allows the caller to re-use memory between message reads, and to avoid allocating a new Message on the heap for every message read. - optimized indexed reading to read much faster and use much less memory. 
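As a rough usage sketch of the new `NextInto` API (not part of this patch; the input file `demo.mcap` is hypothetical), the reuse pattern looks like this:

```go
package main

import (
	"errors"
	"fmt"
	"io"
	"os"

	"github.com/foxglove/mcap/go/mcap"
)

func main() {
	f, err := os.Open("demo.mcap") // hypothetical input file
	if err != nil {
		panic(err)
	}
	defer f.Close()
	reader, err := mcap.NewReader(f)
	if err != nil {
		panic(err)
	}
	it, err := reader.Messages(mcap.UsingIndex(true))
	if err != nil {
		panic(err)
	}
	// One Message struct (and its Data buffer) is reused across the whole iteration,
	// rather than allocating a fresh Message for every call as Next() does.
	msg := mcap.Message{}
	count := 0
	for {
		_, _, _, err := it.NextInto(&msg)
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			panic(err)
		}
		count++
	}
	fmt.Println("messages read:", count)
}
```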
### Docs - [x] Update docstrings - [x] Update examples ### Description This PR makes message iteration much faster. The included benchmark shows a pretty significant speedup: #### Before Note: the comparison benchmark based on `main` uses MessageIterator.Next() to read messages. ``` $ go test . -run=^$ -bench=BenchmarkReader -benchmem -benchtime=10x goos: darwin goarch: arm64 pkg: github.com/foxglove/mcap/go/mcap BenchmarkReader/inorder/no_index-8 10 532682221 ns/op 238.83 MB/s 7826155 msg/s 533306349 B/op 8054975 allocs/op BenchmarkReader/inorder/index_file_order-8 10 1886590288 ns/op 67.29 MB/s 2204813 msg/s 909884028 B/op 12114339 allocs/op BenchmarkReader/inorder/index_time_order-8 10 2248067917 ns/op 54.39 MB/s 1782145 msg/s 909889379 B/op 12114382 allocs/op BenchmarkReader/inorder/index_rev_order-8 10 2324738488 ns/op 48.90 MB/s 1602346 msg/s 910261216 B/op 12114355 allocs/op BenchmarkReader/inorder/bare_lexer-8 10 196806788 ns/op 660.42 MB/s 21640757 msg/s 17005039 B/op 4672 allocs/op BenchmarkReader/minor/no_index-8 10 509497992 ns/op 241.92 MB/s 7927082 msg/s 531637254 B/op 8054932 allocs/op BenchmarkReader/minor/index_file_order-8 10 1837735883 ns/op 66.97 MB/s 2194637 msg/s 909846889 B/op 12114373 allocs/op BenchmarkReader/minor/index_time_order-8 10 2250390946 ns/op 54.82 MB/s 1796497 msg/s 909844632 B/op 12114332 allocs/op BenchmarkReader/minor/index_rev_order-8 10 2360883250 ns/op 53.23 MB/s 1744292 msg/s 910212621 B/op 12114308 allocs/op BenchmarkReader/minor/bare_lexer-8 10 195830417 ns/op 638.20 MB/s 20912477 msg/s 15341232 B/op 4655 allocs/op BenchmarkReader/major/no_index-8 10 510946658 ns/op 241.74 MB/s 7921189 msg/s 532934768 B/op 8054945 allocs/op BenchmarkReader/major/index_file_order-8 10 1841807000 ns/op 66.35 MB/s 2174050 msg/s 909833931 B/op 12114348 allocs/op BenchmarkReader/major/index_time_order-8 10 2247866758 ns/op 54.53 MB/s 1786987 msg/s 909836941 B/op 12114379 allocs/op BenchmarkReader/major/index_rev_order-8 10 2328824133 ns/op 51.12 MB/s 1675101 msg/s 910215724 B/op 12114370 allocs/op BenchmarkReader/major/bare_lexer-8 10 198086167 ns/op 632.44 MB/s 20724011 msg/s 16635893 B/op 4661 allocs/op PASS ok github.com/foxglove/mcap/go/mcap 248.060s ``` #### After ``` % go test . 
-run=^$ -bench=BenchmarkReader -benchmem -benchtime=10x goos: darwin goarch: arm64 pkg: github.com/foxglove/mcap/go/mcap BenchmarkReader/inorder/no_index-8 10 209814421 ns/op 596.62 MB/s 19550071 msg/s 17491784 B/op 6310 allocs/op BenchmarkReader/inorder/index_file_order-8 10 340508775 ns/op 360.08 MB/s 11799088 msg/s 10446040 B/op 16981 allocs/op BenchmarkReader/inorder/index_time_order-8 10 341343088 ns/op 359.00 MB/s 11763672 msg/s 10443932 B/op 16955 allocs/op BenchmarkReader/inorder/index_rev_order-8 10 348526088 ns/op 356.74 MB/s 11689775 msg/s 9996309 B/op 16964 allocs/op BenchmarkReader/inorder/bare_lexer-8 10 187405846 ns/op 664.57 MB/s 21776806 msg/s 17439823 B/op 4674 allocs/op BenchmarkReader/minor/no_index-8 10 211110267 ns/op 587.06 MB/s 19236916 msg/s 16522652 B/op 6284 allocs/op BenchmarkReader/minor/index_file_order-8 10 356903517 ns/op 336.77 MB/s 11035283 msg/s 10419253 B/op 17006 allocs/op BenchmarkReader/minor/index_time_order-8 10 552369746 ns/op 218.44 MB/s 7157955 msg/s 10444996 B/op 17744 allocs/op BenchmarkReader/minor/index_rev_order-8 10 555191658 ns/op 220.27 MB/s 7217936 msg/s 9971279 B/op 17665 allocs/op BenchmarkReader/minor/bare_lexer-8 10 194812112 ns/op 554.57 MB/s 18172347 msg/s 16473271 B/op 4670 allocs/op BenchmarkReader/major/no_index-8 10 211406192 ns/op 579.88 MB/s 19001450 msg/s 17365727 B/op 6291 allocs/op BenchmarkReader/major/index_file_order-8 10 354124750 ns/op 347.12 MB/s 11374355 msg/s 10418725 B/op 16979 allocs/op BenchmarkReader/major/index_time_order-8 10 566783688 ns/op 215.38 MB/s 7057431 msg/s 16452847 B/op 17690 allocs/op BenchmarkReader/major/index_rev_order-8 10 563155871 ns/op 218.15 MB/s 7148236 msg/s 15986112 B/op 17699 allocs/op BenchmarkReader/major/bare_lexer-8 10 195610721 ns/op 633.25 MB/s 20750327 msg/s 17316992 B/op 4672 allocs/op PASS ok github.com/foxglove/mcap/go/mcap 68.716s ``` For the unindexed message iterator, all of the difference comes from: - being able to re-use a Message struct between calls to Next() - switching from storing channels and schemas in maps to slices. - using and re-using an internal buffer for lexing MCAP records. For the indexed message iterator, we do all of the same things plus: - We maintain a pool of decompressed chunk buffers, which are re-used after all of the messages from a given chunk are read out. - We no longer read message indexes from the file, choosing instead to read the chunk content and build our own message index in memory. This allows us to read files written without message indexes in order, and also reduces I/O, which in some cases is *probably faster* (slow network connections with large message index overheads) - we no longer use a heap to maintain order, instead we maintain a sorted array of unread message indexes. Every time we encounter a new chunk: - if this chunk does not overlap with the last, clear the message index array and write the new chunk's messages in. If they are already in order, do not sort. This makes ordered iteration over an in-order MCAP as fast as unordered iteration. - if the new chunk's messages are not in order, sort the new chunk's messages. - if the new chunk overlaps with the last, append the new chunk's messages, and sort all unread messages. #### New API justification The issues with `Next(p []byte) (*Schema, *Channel, *Message, error)` that caused me to explore alternatives are: - The buffer `p` is used to store the message data, but if it isn't big enough, a new buffer is allocated for every message. 
A common idiom is to return the newly allocated buffer so that the caller can take ownership of it and re-use it, but the current API doesn't allow that. - A new Message struct is allocated on the heap for every iteration. Even if the message goes out of scope on the next loop, this still causes significant work for the garbage collector to do. This new function signature re-uses the message passed in, if one is passed in. If `nil` is used, it creates a new Message. --- .vscode/settings.json | 5 +- go/.golangci.yaml | 1 - go/cli/mcap/cmd/cat.go | 4 +- go/cli/mcap/cmd/merge.go | 4 +- go/cli/mcap/cmd/sort.go | 7 +- go/cli/mcap/cmd/sort_test.go | 2 +- go/conformance/test-read-conformance/main.go | 2 +- go/mcap/indexed_message_iterator.go | 338 ++++++++++++++----- go/mcap/lexer.go | 2 +- go/mcap/parse.go | 38 ++- go/mcap/range_index_heap.go | 107 ------ go/mcap/range_index_heap_test.go | 89 ----- go/mcap/reader.go | 17 +- go/mcap/reader_test.go | 281 ++++++++++++++- go/mcap/slicemap.go | 44 +++ go/mcap/slicemap_test.go | 33 ++ go/mcap/unindexed_message_iterator.go | 45 ++- go/mcap/version.go | 2 +- go/ros/ros2db3_to_mcap_test.go | 2 +- 19 files changed, 671 insertions(+), 352 deletions(-) delete mode 100644 go/mcap/range_index_heap.go delete mode 100644 go/mcap/range_index_heap_test.go create mode 100644 go/mcap/slicemap.go create mode 100644 go/mcap/slicemap_test.go diff --git a/.vscode/settings.json b/.vscode/settings.json index 92e718ef0d..d65b6a426c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -44,5 +44,8 @@ // https://github.com/microsoft/vscode-cpptools/issues/722 "C_Cpp.autoAddFileAssociations": false, - "C_Cpp.default.cppStandard": "c++17" + "C_Cpp.default.cppStandard": "c++17", + "[go]": { + "editor.defaultFormatter": "golang.go" + } } diff --git a/go/.golangci.yaml b/go/.golangci.yaml index d79ccf5330..4371be5156 100644 --- a/go/.golangci.yaml +++ b/go/.golangci.yaml @@ -10,7 +10,6 @@ linters: - govet - ineffassign - staticcheck - - structcheck - typecheck - unused - varcheck diff --git a/go/cli/mcap/cmd/cat.go b/go/cli/mcap/cmd/cat.go index 5cc687d028..97f0d3ad12 100644 --- a/go/cli/mcap/cmd/cat.go +++ b/go/cli/mcap/cmd/cat.go @@ -201,12 +201,12 @@ func printMessages( ) error { msg := &bytes.Buffer{} msgReader := &bytes.Reader{} - buf := make([]byte, 1024*1024) + message := mcap.Message{Data: make([]byte, 0, 1024*1024)} transcoders := make(map[uint16]*ros1msg.JSONTranscoder) descriptors := make(map[uint16]protoreflect.MessageDescriptor) jsonWriter := newJSONOutputWriter(w) for { - schema, channel, message, err := it.Next(buf) + schema, channel, _, err := it.NextInto(&message) if err != nil { if errors.Is(err, io.EOF) { break diff --git a/go/cli/mcap/cmd/merge.go b/go/cli/mcap/cmd/merge.go index 5142ecd6f9..2c3bf2ef28 100644 --- a/go/cli/mcap/cmd/merge.go +++ b/go/cli/mcap/cmd/merge.go @@ -300,7 +300,7 @@ func (m *mcapMerger) mergeInputs(w io.Writer, inputs []namedReader) error { } for inputID, iterator := range iterators { inputName := inputs[inputID].name - schema, channel, message, err := iterator.Next(nil) + schema, channel, message, err := iterator.NextInto(nil) if err != nil { if errors.Is(err, io.EOF) { // the file may be an empty mcap. if so, just ignore it. @@ -336,7 +336,7 @@ func (m *mcapMerger) mergeInputs(w io.Writer, inputs []namedReader) error { // Pull the next message off the iterator, to replace the one just // popped from the queue. Before pushing this message, it must be // renumbered and the related channels/schemas may need to be inserted. 
- newSchema, newChannel, newMessage, err := iterators[msg.InputID].Next(nil) + newSchema, newChannel, newMessage, err := iterators[msg.InputID].NextInto(nil) if err != nil { if errors.Is(err, io.EOF) { // if the iterator is empty, skip this read. No further messages diff --git a/go/cli/mcap/cmd/sort.go b/go/cli/mcap/cmd/sort.go index 5687abb939..e14df3a2bc 100644 --- a/go/cli/mcap/cmd/sort.go +++ b/go/cli/mcap/cmd/sort.go @@ -45,7 +45,7 @@ func fileHasNoMessages(r io.ReadSeeker) (bool, error) { if err != nil { return false, err } - _, _, _, err = it.Next(nil) + _, _, _, err = it.NextInto(nil) if err != nil { if errors.Is(err, io.EOF) { return true, nil @@ -124,8 +124,9 @@ func sortFile(w io.Writer, r io.ReadSeeker) error { } schemas := make(map[uint16]*mcap.Schema) channels := make(map[uint16]*mcap.Schema) + message := mcap.Message{} for { - schema, channel, message, err := it.Next(nil) + schema, channel, _, err := it.NextInto(&message) if err != nil { if errors.Is(err, io.EOF) { break @@ -145,7 +146,7 @@ func sortFile(w io.Writer, r io.ReadSeeker) error { return fmt.Errorf("failed to write channel: %w", err) } } - err = writer.WriteMessage(message) + err = writer.WriteMessage(&message) if err != nil { return fmt.Errorf("failed to write message: %w", err) } diff --git a/go/cli/mcap/cmd/sort_test.go b/go/cli/mcap/cmd/sort_test.go index cb3e750397..ee7d0e5c03 100644 --- a/go/cli/mcap/cmd/sort_test.go +++ b/go/cli/mcap/cmd/sort_test.go @@ -69,7 +69,7 @@ func TestSortFile(t *testing.T) { it, err := r.Messages(mcap.UsingIndex(false)) require.NoError(t, err) - _, _, msg, err := it.Next(nil) + _, _, msg, err := it.NextInto(nil) require.NoError(t, err) assert.Equal(t, 25, int(msg.LogTime)) } diff --git a/go/conformance/test-read-conformance/main.go b/go/conformance/test-read-conformance/main.go index 9bb44ccf02..fcbfb77360 100644 --- a/go/conformance/test-read-conformance/main.go +++ b/go/conformance/test-read-conformance/main.go @@ -354,7 +354,7 @@ func readIndexed(w io.Writer, filepath string) error { knownChannelIDs := make(map[uint16]bool) for { - schema, channel, message, err := it.Next(nil) + schema, channel, message, err := it.NextInto(nil) if errors.Is(err, io.EOF) { break } diff --git a/go/mcap/indexed_message_iterator.go b/go/mcap/indexed_message_iterator.go index f6dcc11746..d351326e13 100644 --- a/go/mcap/indexed_message_iterator.go +++ b/go/mcap/indexed_message_iterator.go @@ -6,6 +6,9 @@ import ( "encoding/binary" "fmt" "io" + "math/bits" + "slices" + "sort" "github.com/klauspost/compress/zstd" "github.com/pierrec/lz4/v4" @@ -15,32 +18,54 @@ const ( chunkBufferGrowthMultiple = 1.2 ) -// indexedMessageIterator is an iterator over an indexed mcap read seeker (as -// seeking is required). It makes reads in alternation from the index data -// section, the message index at the end of a chunk, and the chunk's contents. +type chunkSlot struct { + buf []byte + unreadMessages uint64 +} + +type messageIndexWithChunkSlot struct { + timestamp uint64 + offset uint64 + chunkSlotIndex int +} + +// indexedMessageIterator is an iterator over an indexed mcap io.ReadSeeker (as +// seeking is required). It reads index information from the MCAP summary section first, then +// seeks to chunk records in the data section. +// +// This iterator reads in order by maintaining two ordered queues, one for chunk indexes and one +// for message indexes. On every call to NextInto(), the front element of both queues is checked and +// the earlier is used. 
When a chunk index is first, the chunk is decompressed, indexed, the +// new message indexes are pushed onto the message index queue and sorted. +// When a message index is first, that message is copied out of the decompressed chunk and yielded +// to the caller. type indexedMessageIterator struct { lexer *Lexer rs io.ReadSeeker topics map[string]bool start uint64 end uint64 + order ReadOrder - channels map[uint16]*Channel - schemas map[uint16]*Schema + channels slicemap[Channel] + schemas slicemap[Schema] statistics *Statistics chunkIndexes []*ChunkIndex attachmentIndexes []*AttachmentIndex metadataIndexes []*MetadataIndex footer *Footer - indexHeap rangeIndexHeap + curChunkIndex int + messageIndexes []messageIndexWithChunkSlot + curMessageIndex int + chunkSlots []chunkSlot zstdDecoder *zstd.Decoder lz4Reader *lz4.Reader hasReadSummarySection bool - compressedChunkAndMessageIndex []byte - metadataCallback func(*Metadata) error + recordBuf []byte + metadataCallback func(*Metadata) error } // parseIndexSection parses the index section of the file and populates the @@ -95,14 +120,14 @@ func (it *indexedMessageIterator) parseSummarySection() error { if err != nil { return fmt.Errorf("failed to parse schema: %w", err) } - it.schemas[schema.ID] = schema + it.schemas.Set(schema.ID, schema) case TokenChannel: channelInfo, err := ParseChannel(record) if err != nil { return fmt.Errorf("failed to parse channel info: %w", err) } if len(it.topics) == 0 || it.topics[channelInfo.Topic] { - it.channels[channelInfo.ID] = channelInfo + it.channels.Set(channelInfo.ID, channelInfo) } case TokenAttachmentIndex: idx, err := ParseAttachmentIndex(record) @@ -121,17 +146,11 @@ func (it *indexedMessageIterator) parseSummarySection() error { if err != nil { return fmt.Errorf("failed to parse attachment index: %w", err) } - it.chunkIndexes = append(it.chunkIndexes, idx) // if the chunk overlaps with the requested parameters, load it - for _, channel := range it.channels { - if idx.MessageIndexOffsets[channel.ID] > 0 { + for _, channel := range it.channels.Slice() { + if channel != nil && idx.MessageIndexOffsets[channel.ID] > 0 { if (it.end == 0 && it.start == 0) || (idx.MessageStartTime < it.end && idx.MessageEndTime >= it.start) { - rangeIndex := rangeIndex{ - chunkIndex: idx, - } - if err := it.indexHeap.HeapPush(rangeIndex); err != nil { - return err - } + it.chunkIndexes = append(it.chunkIndexes, idx) } break } @@ -143,36 +162,79 @@ func (it *indexedMessageIterator) parseSummarySection() error { } it.statistics = stats case TokenFooter: + // sort chunk indexes in the order that they will need to be loaded, depending on the specified + // read order. 
+ switch it.order { + case FileOrder: + sort.Slice(it.chunkIndexes, func(i, j int) bool { + return it.chunkIndexes[i].ChunkStartOffset < it.chunkIndexes[j].ChunkStartOffset + }) + case LogTimeOrder: + sort.Slice(it.chunkIndexes, func(i, j int) bool { + if it.chunkIndexes[i].MessageStartTime == it.chunkIndexes[j].MessageStartTime { + return it.chunkIndexes[i].ChunkStartOffset < it.chunkIndexes[j].ChunkStartOffset + } + return it.chunkIndexes[i].MessageStartTime < it.chunkIndexes[j].MessageStartTime + }) + case ReverseLogTimeOrder: + sort.Slice(it.chunkIndexes, func(i, j int) bool { + if it.chunkIndexes[i].MessageEndTime == it.chunkIndexes[j].MessageEndTime { + return it.chunkIndexes[i].ChunkStartOffset > it.chunkIndexes[j].ChunkStartOffset + } + return it.chunkIndexes[i].MessageEndTime > it.chunkIndexes[j].MessageEndTime + }) + } it.hasReadSummarySection = true return nil } } } +// loadChunk seeks to and decompresses a chunk into a chunk slot, then populates it.messageIndexes +// with the offsets of messages in that chunk. func (it *indexedMessageIterator) loadChunk(chunkIndex *ChunkIndex) error { _, err := it.rs.Seek(int64(chunkIndex.ChunkStartOffset), io.SeekStart) if err != nil { return err } - compressedChunkLength := chunkIndex.ChunkLength + chunkIndex.MessageIndexLength - if len(it.compressedChunkAndMessageIndex) < int(compressedChunkLength) { + compressedChunkLength := chunkIndex.ChunkLength + if uint64(cap(it.recordBuf)) < compressedChunkLength { newSize := int(float64(compressedChunkLength) * chunkBufferGrowthMultiple) - it.compressedChunkAndMessageIndex = make([]byte, newSize) + it.recordBuf = make([]byte, newSize) + } else { + it.recordBuf = it.recordBuf[:compressedChunkLength] } - _, err = io.ReadFull(it.rs, it.compressedChunkAndMessageIndex[:compressedChunkLength]) + _, err = io.ReadFull(it.rs, it.recordBuf) if err != nil { return fmt.Errorf("failed to read chunk data: %w", err) } - parsedChunk, err := ParseChunk(it.compressedChunkAndMessageIndex[9:chunkIndex.ChunkLength]) + parsedChunk, err := ParseChunk(it.recordBuf[9:]) if err != nil { return fmt.Errorf("failed to parse chunk: %w", err) } // decompress the chunk data - var chunkData []byte + chunkSlotIndex := -1 + for i, chunkSlot := range it.chunkSlots { + if chunkSlot.unreadMessages == 0 { + chunkSlotIndex = i + break + } + } + if chunkSlotIndex == -1 { + it.chunkSlots = append(it.chunkSlots, chunkSlot{}) + chunkSlotIndex = len(it.chunkSlots) - 1 + } + chunkSlot := &it.chunkSlots[chunkSlotIndex] + bufSize := parsedChunk.UncompressedSize + if uint64(cap(chunkSlot.buf)) < bufSize { + chunkSlot.buf = make([]byte, bufSize) + } else { + chunkSlot.buf = chunkSlot.buf[:bufSize] + } switch CompressionFormat(parsedChunk.Compression) { case CompressionNone: - chunkData = parsedChunk.Records + copy(chunkSlot.buf, parsedChunk.Records) case CompressionZSTD: if it.zstdDecoder == nil { it.zstdDecoder, err = zstd.NewReader(nil) @@ -180,8 +242,7 @@ func (it *indexedMessageIterator) loadChunk(chunkIndex *ChunkIndex) error { return fmt.Errorf("failed to instantiate zstd decoder: %w", err) } } - chunkData = make([]byte, 0, parsedChunk.UncompressedSize) - chunkData, err = it.zstdDecoder.DecodeAll(parsedChunk.Records, chunkData) + chunkSlot.buf, err = it.zstdDecoder.DecodeAll(parsedChunk.Records, chunkSlot.buf[:0]) if err != nil { return fmt.Errorf("failed to decode chunk data: %w", err) } @@ -191,83 +252,144 @@ func (it *indexedMessageIterator) loadChunk(chunkIndex *ChunkIndex) error { } else { 
it.lz4Reader.Reset(bytes.NewReader(parsedChunk.Records)) } - chunkData = make([]byte, parsedChunk.UncompressedSize) - _, err = io.ReadFull(it.lz4Reader, chunkData) + _, err = io.ReadFull(it.lz4Reader, chunkSlot.buf) if err != nil { return fmt.Errorf("failed to decompress lz4 chunk: %w", err) } default: return fmt.Errorf("unsupported compression %s", parsedChunk.Compression) } - // use the message index to find the messages we want from the chunk - messageIndexSection := it.compressedChunkAndMessageIndex[chunkIndex.ChunkLength:compressedChunkLength] - var recordLen uint64 - offset := 0 - for offset < len(messageIndexSection) { - if op := OpCode(messageIndexSection[offset]); op != OpMessageIndex { - return fmt.Errorf("unexpected token %s in message index section", op) - } - offset++ - recordLen, offset, err = getUint64(messageIndexSection, offset) - if err != nil { - return fmt.Errorf("failed to get message index record length: %w", err) + // produce message indexes for the newly decompressed chunk data. + var maxLogTime uint64 + // if there are no message indexes outstanding, truncate now. + if it.curMessageIndex == len(it.messageIndexes) { + it.curMessageIndex = 0 + it.messageIndexes = it.messageIndexes[:0] + } + sortingRequired := it.curMessageIndex != 0 + startIdx := len(it.messageIndexes) + for offset := uint64(0); offset < bufSize; { + if bufSize < offset+1+8 { + return fmt.Errorf("expected another record in chunk, but left with %d bytes", bufSize-offset) } - messageIndex, err := ParseMessageIndex(messageIndexSection[offset : uint64(offset)+recordLen]) - if err != nil { - return fmt.Errorf("failed to parse message index: %w", err) + opcodeAndLengthBuf := chunkSlot.buf[offset : offset+1+8] + op := OpCode(opcodeAndLengthBuf[0]) + recordLen := binary.LittleEndian.Uint64(opcodeAndLengthBuf[1:]) + recordStart := offset + 1 + 8 + recordEnd, overflow := checkedAdd(recordStart, recordLen) + if overflow { + return fmt.Errorf("record length extends past uint64 range: start: %d, len: %d", recordStart, recordLen) } - offset += int(recordLen) - // skip message indexes for channels we don't need - if _, ok := it.channels[messageIndex.ChannelID]; !ok { - continue + if bufSize < recordEnd { + return fmt.Errorf( + "%s record in chunk has length %d bytes but only %d remaining in chunk", + op, recordLen, bufSize-recordStart) } - // push any message index entries in the requested time range to the heap to read. 
- for i := range messageIndex.Records { - timestamp := messageIndex.Records[i].Timestamp - if timestamp >= it.start && timestamp < it.end { - if err := it.indexHeap.HeapPush(rangeIndex{ - chunkIndex: chunkIndex, - messageIndexEntry: &messageIndex.Records[i], - buf: chunkData, - }); err != nil { - return err + recordContent := chunkSlot.buf[recordStart:recordEnd] + if op == OpMessage { + msg := Message{} + if err := msg.PopulateFrom(recordContent, false); err != nil { + return fmt.Errorf("could not parse message in chunk: %w", err) + } + if it.channels.Get(msg.ChannelID) != nil { + if msg.LogTime >= it.start && msg.LogTime < it.end { + it.messageIndexes = append(it.messageIndexes, messageIndexWithChunkSlot{ + timestamp: msg.LogTime, + offset: offset, + chunkSlotIndex: chunkSlotIndex, + }) + if msg.LogTime < maxLogTime { + sortingRequired = true + } else { + maxLogTime = msg.LogTime + } + chunkSlot.unreadMessages++ } } } + offset = recordEnd } + unreadMessageIndexes := it.messageIndexes[it.curMessageIndex:] + switch it.order { + case FileOrder: + // message indexes are already in file order, no sorting needed + case LogTimeOrder: + if sortingRequired { + // We stable-sort to ensure that if messages in different chunks have the + // same timestamp, the one from the earlier-loaded chunk is returned first. The offset + // field of the message index is not comparable between indexes of different chunks. + sort.SliceStable(unreadMessageIndexes, func(i, j int) bool { + return unreadMessageIndexes[i].timestamp < unreadMessageIndexes[j].timestamp + }) + } + case ReverseLogTimeOrder: + // assume message indexes will always be mostly-in-order, so reversing the newly-added + // indexes will put them mostly into reverse order, which speeds up sorting. + // If the chunk is in order, no sorting is needed after reversing. + slices.Reverse(it.messageIndexes[startIdx:]) + if sortingRequired { + sort.SliceStable(unreadMessageIndexes, func(i, j int) bool { + return unreadMessageIndexes[i].timestamp > unreadMessageIndexes[j].timestamp + }) + } + } + // if there is more dead space at the front than there is live, remove the dead space by + // copying the live data to the front and truncating. + if len(unreadMessageIndexes) < it.curMessageIndex { + copy(it.messageIndexes[:len(unreadMessageIndexes)], unreadMessageIndexes) + it.messageIndexes = it.messageIndexes[:len(unreadMessageIndexes)] + it.curMessageIndex = 0 + } + return nil } -func readRecord(r io.Reader) (OpCode, []byte, error) { - buf := make([]byte, 9) +func readRecord(r io.Reader, buf []byte) (OpCode, []byte, error) { + if cap(buf) < 9 { + buf = make([]byte, 9) + } else { + buf = buf[:9] + } _, err := io.ReadFull(r, buf) if err != nil { return 0, nil, fmt.Errorf("failed to read record header: %w", err) } opcode := OpCode(buf[0]) recordLen := binary.LittleEndian.Uint64(buf[1:]) - record := make([]byte, recordLen) - _, err = io.ReadFull(r, record) + if uint64(cap(buf)) < recordLen { + buf = make([]byte, recordLen) + } else { + buf = buf[:recordLen] + } + _, err = io.ReadFull(r, buf) if err != nil { return 0, nil, fmt.Errorf("failed to read record: %w", err) } - return opcode, record, nil + return opcode, buf, nil } -func (it *indexedMessageIterator) Next(_ []byte) (*Schema, *Channel, *Message, error) { +// NextInto yields the next message from the iterator, writing the result into the provided Message +// struct. The msg.Data buffer will be reused if it has enough capacity. If `msg` is nil, a new +// Message will be allocated. 
+func (it *indexedMessageIterator) NextInto(msg *Message) (*Schema, *Channel, *Message, error) { + if msg == nil { + msg = &Message{} + } if !it.hasReadSummarySection { - err := it.parseSummarySection() - if err != nil { + if err := it.parseSummarySection(); err != nil { return nil, nil, nil, err } // take care of the metadata here if it.metadataCallback != nil { for _, idx := range it.metadataIndexes { - _, err = it.rs.Seek(int64(idx.Offset), io.SeekStart) + _, err := it.rs.Seek(int64(idx.Offset), io.SeekStart) if err != nil { return nil, nil, nil, fmt.Errorf("failed to seek to metadata: %w", err) } - opcode, data, err := readRecord(it.rs) + opcode, data, err := readRecord(it.rs, it.recordBuf) + if cap(data) > cap(it.recordBuf) { + it.recordBuf = data + } if err != nil { return nil, nil, nil, fmt.Errorf("failed to read metadata record: %w", err) } @@ -285,29 +407,71 @@ func (it *indexedMessageIterator) Next(_ []byte) (*Schema, *Channel, *Message, e } } } - - for it.indexHeap.Len() > 0 { - ri, err := it.indexHeap.HeapPop() - if err != nil { - return nil, nil, nil, err - } - if ri.messageIndexEntry == nil { - err := it.loadChunk(ri.chunkIndex) - if err != nil { + for { + // if there are no indexed messages to yield, load a chunk + if it.curMessageIndex >= len(it.messageIndexes) { + // if there are no more chunks, iteration ends + if it.curChunkIndex >= len(it.chunkIndexes) { + return nil, nil, nil, io.EOF + } + chunkIndex := it.chunkIndexes[it.curChunkIndex] + if err := it.loadChunk(chunkIndex); err != nil { return nil, nil, nil, err } + it.curChunkIndex++ continue } - chunkOffset := ri.messageIndexEntry.Offset - length := binary.LittleEndian.Uint64(ri.buf[chunkOffset+1:]) - messageData := ri.buf[chunkOffset+1+8 : chunkOffset+1+8+length] - message, err := ParseMessage(messageData) - if err != nil { + // if there are more chunks left, check if the next one should be loaded before yielding another + // message + if it.curChunkIndex < len(it.chunkIndexes) { + chunkIndex := it.chunkIndexes[it.curChunkIndex] + messageIndex := it.messageIndexes[it.curMessageIndex] + if (it.order == LogTimeOrder && chunkIndex.MessageStartTime < messageIndex.timestamp) || + (it.order == ReverseLogTimeOrder && chunkIndex.MessageEndTime > messageIndex.timestamp) { + if err := it.loadChunk(chunkIndex); err != nil { + return nil, nil, nil, err + } + it.curChunkIndex++ + continue + } + } + // yield the next message + messageIndex := it.messageIndexes[it.curMessageIndex] + chunkSlot := &it.chunkSlots[messageIndex.chunkSlotIndex] + messageDataStart, overflow := checkedAdd(messageIndex.offset, 1+8) + if overflow { + return nil, nil, nil, fmt.Errorf("message offset in chunk too close to uint64 max: %d", messageIndex.offset) + } + length := binary.LittleEndian.Uint64(chunkSlot.buf[messageIndex.offset+1:]) + messageDataEnd, overflow := checkedAdd(messageDataStart, length) + if overflow { + return nil, nil, nil, fmt.Errorf("message record length extends past uint64 range: %d", length) + } + messageData := chunkSlot.buf[messageDataStart:messageDataEnd] + if err := msg.PopulateFrom(messageData, true); err != nil { return nil, nil, nil, err } - channel := it.channels[message.ChannelID] - schema := it.schemas[channel.SchemaID] - return schema, channel, message, nil + chunkSlot.unreadMessages-- + it.curMessageIndex++ + channel := it.channels.Get(msg.ChannelID) + if channel == nil { + return nil, nil, nil, fmt.Errorf("message with unrecognized channel ID %d", msg.ChannelID) + } + schema := it.schemas.Get(channel.SchemaID) + if 
schema == nil && channel.SchemaID != 0 { + return nil, nil, nil, fmt.Errorf("channel %d with unrecognized schema ID %d", msg.ChannelID, channel.SchemaID) + } + return schema, channel, msg, nil } - return nil, nil, nil, io.EOF +} + +func (it *indexedMessageIterator) Next(buf []byte) (*Schema, *Channel, *Message, error) { + msg := &Message{Data: buf} + return it.NextInto(msg) +} + +// returns the sum of two uint64s, with a boolean indicating if the sum overflowed. +func checkedAdd(a, b uint64) (uint64, bool) { + res, carry := bits.Add64(a, b, 0) + return res, carry != 0 } diff --git a/go/mcap/lexer.go b/go/mcap/lexer.go index fe10c665fa..9c245194ce 100644 --- a/go/mcap/lexer.go +++ b/go/mcap/lexer.go @@ -265,7 +265,7 @@ func (l *Lexer) Next(p []byte) (TokenType, []byte, error) { continue } - if recordLen > uint64(len(p)) { + if recordLen > uint64(cap(p)) { p, err = makeSafe(recordLen) if err != nil { return TokenError, nil, fmt.Errorf("failed to allocate %d bytes for %s token: %w", recordLen, opcode, err) diff --git a/go/mcap/parse.go b/go/mcap/parse.go index 992b35eaf7..0619db6648 100644 --- a/go/mcap/parse.go +++ b/go/mcap/parse.go @@ -99,32 +99,44 @@ func ParseChannel(buf []byte) (*Channel, error) { }, nil } -// ParseMessage parses a message record. -func ParseMessage(buf []byte) (*Message, error) { +// PopulateFrom populates the fields of a Message struct from the message data slice. +func (m *Message) PopulateFrom(buf []byte, copyData bool) error { channelID, offset, err := getUint16(buf, 0) if err != nil { - return nil, fmt.Errorf("failed to read channel ID: %w", err) + return fmt.Errorf("failed to read channel ID: %w", err) } sequence, offset, err := getUint32(buf, offset) if err != nil { - return nil, fmt.Errorf("failed to read sequence: %w", err) + return fmt.Errorf("failed to read sequence: %w", err) } logTime, offset, err := getUint64(buf, offset) if err != nil { - return nil, fmt.Errorf("failed to read record time: %w", err) + return fmt.Errorf("failed to read record time: %w", err) } publishTime, offset, err := getUint64(buf, offset) if err != nil { - return nil, fmt.Errorf("failed to read publish time: %w", err) + return fmt.Errorf("failed to read publish time: %w", err) } data := buf[offset:] - return &Message{ - ChannelID: channelID, - Sequence: sequence, - LogTime: logTime, - PublishTime: publishTime, - Data: data, - }, nil + m.ChannelID = channelID + m.Sequence = sequence + m.LogTime = logTime + m.PublishTime = publishTime + if copyData { + m.Data = append(m.Data[:0], data...) + } else { + m.Data = data + } + return nil +} + +// ParseMessage parses a message record. +func ParseMessage(buf []byte) (*Message, error) { + msg := &Message{} + if err := msg.PopulateFrom(buf, false); err != nil { + return nil, err + } + return msg, nil } // ParseChunk parses a chunk record. diff --git a/go/mcap/range_index_heap.go b/go/mcap/range_index_heap.go deleted file mode 100644 index 284ecaf084..0000000000 --- a/go/mcap/range_index_heap.go +++ /dev/null @@ -1,107 +0,0 @@ -package mcap - -import ( - "container/heap" - "fmt" -) - -// rangeIndex refers to either a chunk (via the ChunkIndex, with other fields nil) -// or to a message in a chunk, in which case all fields are set. -type rangeIndex struct { - chunkIndex *ChunkIndex - messageIndexEntry *MessageIndexEntry - buf []uint8 // if messageIndexEntry is not nil, `buf` should point to the underlying chunk. -} - -// heap of rangeIndex entries, where the entries are sorted by their log time. 
-type rangeIndexHeap struct { - indices []rangeIndex - order ReadOrder - lastErr error -} - -// key returns the comparison key used for elements in this heap. -func (h rangeIndexHeap) timestamp(i int) uint64 { - ri := h.indices[i] - if ri.messageIndexEntry == nil { - if h.order == ReverseLogTimeOrder { - return ri.chunkIndex.MessageEndTime - } - return ri.chunkIndex.MessageStartTime - } - return ri.messageIndexEntry.Timestamp -} - -func (h *rangeIndexHeap) filePositionLess(i, j int) bool { - a := h.indices[i] - b := h.indices[j] - - // if comparing two chunks, whichever chunk comes earlier wins. - // if comparing messages in two different chunks, the message in the earlier chunk wins. - // if comparing a message in one chunk to another chunk, whichever chunk is earlier wins. - if a.chunkIndex.ChunkStartOffset != b.chunkIndex.ChunkStartOffset { - return a.chunkIndex.ChunkStartOffset < b.chunkIndex.ChunkStartOffset - } - // If comparing two messages in the same chunk, the earlier message in the chunk wins. - if a.messageIndexEntry != nil && b.messageIndexEntry != nil { - return a.messageIndexEntry.Offset < b.messageIndexEntry.Offset - } - // If we came this far, we're comparing a message in a chunk against the same chunk! - // this is a problem, because when the chunk reaches the top of the heap it will be expanded, - // and the same message will be pushed into the heap twice. - h.lastErr = fmt.Errorf("detected duplicate data: a: %v, b: %v", a, b) - return false -} - -// Required for sort.Interface. -func (h rangeIndexHeap) Len() int { return len(h.indices) } -func (h rangeIndexHeap) Swap(i, j int) { h.indices[i], h.indices[j] = h.indices[j], h.indices[i] } - -// Push is required by `heap.Interface`. Note that this is not the same as `heap.Push`! -// expected behavior by `heap` is: "add x as element len()". -func (h *rangeIndexHeap) Push(x interface{}) { - h.indices = append(h.indices, x.(rangeIndex)) -} - -// Pop is required by `heap.Interface`. Note that this is not the same as `heap.Pop`! -// expected behavior by `heap` is: "remove and return element Len() - 1". -func (h *rangeIndexHeap) Pop() interface{} { - old := h.indices - n := len(old) - x := old[n-1] - h.indices = old[0 : n-1] - return x -} - -// Less is required by `heap.Interface`. 
-func (h *rangeIndexHeap) Less(i, j int) bool { - switch h.order { - case FileOrder: - return h.filePositionLess(i, j) - case LogTimeOrder: - if h.timestamp(i) == h.timestamp(j) { - return h.filePositionLess(i, j) - } - return h.timestamp(i) < h.timestamp(j) - case ReverseLogTimeOrder: - if h.timestamp(i) == h.timestamp(j) { - return h.filePositionLess(j, i) - } - return h.timestamp(i) > h.timestamp(j) - } - h.lastErr = fmt.Errorf("ReadOrder case not handled: %v", h.order) - return false -} - -func (h *rangeIndexHeap) HeapPush(ri rangeIndex) error { - heap.Push(h, ri) - return h.lastErr -} - -func (h *rangeIndexHeap) HeapPop() (*rangeIndex, error) { - result := heap.Pop(h).(rangeIndex) - if h.lastErr != nil { - return nil, h.lastErr - } - return &result, nil -} diff --git a/go/mcap/range_index_heap_test.go b/go/mcap/range_index_heap_test.go deleted file mode 100644 index ef8044f059..0000000000 --- a/go/mcap/range_index_heap_test.go +++ /dev/null @@ -1,89 +0,0 @@ -package mcap - -import ( - "reflect" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -var rangeIndexHeapTestItems = []rangeIndex{ - { - chunkIndex: &ChunkIndex{ - ChunkStartOffset: 1, - MessageStartTime: 100, - MessageEndTime: 300, - }, - }, - { - chunkIndex: &ChunkIndex{ - ChunkStartOffset: 2, - MessageStartTime: 200, - MessageEndTime: 400, - }, - messageIndexEntry: &MessageIndexEntry{Offset: 3, Timestamp: 200}, - }, - { - chunkIndex: &ChunkIndex{ - ChunkStartOffset: 2, - MessageStartTime: 200, - MessageEndTime: 400, - }, - messageIndexEntry: &MessageIndexEntry{Offset: 2, Timestamp: 250}, - }, - { - chunkIndex: &ChunkIndex{ - ChunkStartOffset: 3, - MessageStartTime: 300, - MessageEndTime: 400, - }, - }, -} - -func TestMessageOrdering(t *testing.T) { - cases := []struct { - assertion string - order ReadOrder - expectedIndexOrder []int - }{ - { - assertion: "read time order forwards", - order: LogTimeOrder, - expectedIndexOrder: []int{0, 1, 2, 3}, - }, - { - assertion: "read time order backwards", - order: ReverseLogTimeOrder, - expectedIndexOrder: []int{3, 0, 2, 1}, - }, - { - assertion: "read file order", - order: FileOrder, - expectedIndexOrder: []int{0, 2, 1, 3}, - }, - } - for _, c := range cases { - t.Run(c.assertion, func(t *testing.T) { - h := &rangeIndexHeap{order: c.order} - for _, item := range rangeIndexHeapTestItems { - require.NoError(t, h.HeapPush(item)) - } - assert.Len(t, rangeIndexHeapTestItems, h.Len()) - i := 0 - for h.Len() > 0 { - poppedItem, err := h.HeapPop() - require.NoError(t, err) - found := false - for index, item := range rangeIndexHeapTestItems { - if reflect.DeepEqual(item, *poppedItem) { - assert.Equal(t, c.expectedIndexOrder[i], index) - found = true - } - } - assert.True(t, found) - i++ - } - }) - } -} diff --git a/go/mcap/reader.go b/go/mcap/reader.go index c6566ef385..d013c85b4e 100644 --- a/go/mcap/reader.go +++ b/go/mcap/reader.go @@ -65,7 +65,14 @@ type Reader struct { } type MessageIterator interface { + // Deprecated: use NextInto to avoid repeatedly heap-allocating Message structs while iterating. Next([]byte) (*Schema, *Channel, *Message, error) + // NextInto returns the next message from the MCAP. If the returned error is io.EOF, + // this signals the end of the MCAP. + // If `msg` is not nil, NextInto will populate it with new data and + // return the same pointer, re-using or resizing `msg.Data` as needed. + // If `msg` is nil, NextInto will allocate and return a new Message on the heap. 
+ NextInto(msg *Message) (*Schema, *Channel, *Message, error) } func Range(it MessageIterator, f func(*Schema, *Channel, *Message) error) error { @@ -93,8 +100,6 @@ func (r *Reader) unindexedIterator(opts *ReadOptions) *unindexedMessageIterator r.l.emitChunks = false return &unindexedMessageIterator{ lexer: r.l, - channels: make(map[uint16]*Channel), - schemas: make(map[uint16]*Schema), topics: topicMap, start: opts.StartNanos, end: opts.EndNanos, @@ -114,12 +119,10 @@ func (r *Reader) indexedMessageIterator( return &indexedMessageIterator{ lexer: r.l, rs: r.rs, - channels: make(map[uint16]*Channel), - schemas: make(map[uint16]*Schema), topics: topicMap, start: opts.StartNanos, end: opts.EndNanos, - indexHeap: rangeIndexHeap{order: opts.Order}, + order: opts.Order, metadataCallback: opts.MetadataCallback, } } @@ -192,11 +195,11 @@ func (r *Reader) Info() (*Info, error) { } info := &Info{ Statistics: it.statistics, - Channels: it.channels, + Channels: it.channels.ToMap(), ChunkIndexes: it.chunkIndexes, AttachmentIndexes: it.attachmentIndexes, MetadataIndexes: it.metadataIndexes, - Schemas: it.schemas, + Schemas: it.schemas.ToMap(), Footer: it.footer, Header: r.header, } diff --git a/go/mcap/reader_test.go b/go/mcap/reader_test.go index 45d540fa50..e5bb35f15f 100644 --- a/go/mcap/reader_test.go +++ b/go/mcap/reader_test.go @@ -2,12 +2,15 @@ package mcap import ( "bytes" + "crypto/rand" + "encoding/binary" "errors" "fmt" "io" "math" "os" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -72,6 +75,7 @@ func TestIndexedReaderBreaksTiesOnChunkOffset(t *testing.T) { if errors.Is(err, io.EOF) { break } + require.NoError(t, err) assert.Equal(t, expectedTopics[i], channel.Topic) } } @@ -754,29 +758,15 @@ func TestReadingMessageOrderWithOverlappingChunks(t *testing.T) { }) require.NoError(t, err) require.NoError(t, writer.WriteHeader(&Header{})) - require.NoError(t, writer.WriteSchema(&Schema{ - ID: 1, - Name: "", - Encoding: "", - Data: []byte{}, - })) - require.NoError(t, writer.WriteChannel(&Channel{ - ID: 0, - Topic: "", - SchemaID: 0, - MessageEncoding: "", - Metadata: map[string]string{ - "": "", - }, - })) + require.NoError(t, writer.WriteSchema(&Schema{ID: 1})) + require.NoError(t, writer.WriteChannel(&Channel{ID: 0})) msgCount := 0 addMsg := func(timestamp uint64) { require.NoError(t, writer.WriteMessage(&Message{ ChannelID: 0, - Sequence: 0, LogTime: timestamp, PublishTime: timestamp, - Data: []byte{'h', 'e', 'l', 'l', 'o'}, + Data: []byte("hello"), })) msgCount++ } @@ -831,7 +821,7 @@ func TestReadingMessageOrderWithOverlappingChunks(t *testing.T) { // check that timestamps monotonically decrease from the returned iterator for i := 0; i < msgCount; i++ { - _, _, msg, err := reverseIt.Next(nil) + _, _, msg, err := reverseIt.NextInto(nil) require.NoError(t, err) if i != 0 { assert.Less(t, msg.LogTime, lastSeenTimestamp) @@ -843,6 +833,90 @@ func TestReadingMessageOrderWithOverlappingChunks(t *testing.T) { require.ErrorIs(t, io.EOF, err) } +func TestOrderStableWithEquivalentTimestamps(t *testing.T) { + buf := &bytes.Buffer{} + // write an MCAP with two chunks, where in each chunk all messages have ascending timestamps, + // but their timestamp ranges overlap. 
+ writer, err := NewWriter(buf, &WriterOptions{ + Chunked: true, + ChunkSize: 200, + Compression: CompressionLZ4, + }) + require.NoError(t, err) + require.NoError(t, writer.WriteHeader(&Header{})) + require.NoError(t, writer.WriteSchema(&Schema{ID: 1})) + require.NoError(t, writer.WriteChannel(&Channel{ID: 0, Topic: "a"})) + require.NoError(t, writer.WriteChannel(&Channel{ID: 1, Topic: "b"})) + var msgCount uint64 + msgData := make([]byte, 8) + for len(writer.ChunkIndexes) < 3 { + binary.LittleEndian.PutUint64(msgData, msgCount) + require.NoError(t, writer.WriteMessage(&Message{ + ChannelID: uint16(msgCount % 2), + LogTime: msgCount % 2, + PublishTime: msgCount % 2, + Data: msgData, + })) + msgCount++ + } + require.NoError(t, writer.Close()) + + reader, err := NewReader(bytes.NewReader(buf.Bytes())) + require.NoError(t, err) + + it, err := reader.Messages( + UsingIndex(true), + InOrder(LogTimeOrder), + ) + require.NoError(t, err) + var lastMessageNumber uint64 + var numRead uint64 + for { + _, _, msg, err := it.NextInto(nil) + if errors.Is(err, io.EOF) { + break + } + require.NoError(t, err) + if msg.ChannelID != 0 { + continue + } + assert.Equal(t, uint64(0), msg.LogTime) + msgNumber := binary.LittleEndian.Uint64(msg.Data) + if numRead != 0 { + assert.Greater(t, msgNumber, lastMessageNumber) + } + lastMessageNumber = msgNumber + numRead++ + } + assert.Equal(t, msgCount/2, numRead) + + reverseIt, err := reader.Messages( + UsingIndex(true), + InOrder(ReverseLogTimeOrder), + ) + require.NoError(t, err) + lastMessageNumber = 0 + numRead = 0 + for { + _, _, msg, err := reverseIt.NextInto(nil) + if errors.Is(err, io.EOF) { + break + } + require.NoError(t, err) + if msg.ChannelID != 0 { + continue + } + assert.Equal(t, uint64(0), msg.LogTime) + msgNumber := binary.LittleEndian.Uint64(msg.Data) + fmt.Printf("msgNumber: %d\n", msgNumber) + if numRead != 0 { + assert.Less(t, msgNumber, lastMessageNumber) + } + lastMessageNumber = msgNumber + numRead++ + } +} + func TestReadingBigTimestamps(t *testing.T) { buf := &bytes.Buffer{} w, err := NewWriter(buf, &WriterOptions{ @@ -881,3 +955,174 @@ func TestReadingBigTimestamps(t *testing.T) { assert.Equal(t, 1, count) }) } + +func BenchmarkReader(b *testing.B) { + inputParameters := []struct { + name string + outOfOrderWithinChunks bool + chunksOverlap bool + }{ + { + name: "msgs_in_order", + }, + { + name: "jitter_in_chunk", + outOfOrderWithinChunks: true, + }, + { + name: "chunks_overlap", + outOfOrderWithinChunks: true, + chunksOverlap: true, + }, + } + for _, inputCfg := range inputParameters { + b.Run(inputCfg.name, func(b *testing.B) { + b.StopTimer() + buf := &bytes.Buffer{} + writer, err := NewWriter(buf, &WriterOptions{ + Chunked: true, + Compression: CompressionZSTD, + }) + require.NoError(b, err) + messageCount := uint64(4000000) + require.NoError(b, writer.WriteHeader(&Header{})) + require.NoError(b, writer.WriteSchema(&Schema{ID: 1, Name: "empty", Encoding: "none"})) + channelCount := 200 + for i := 0; i < channelCount; i++ { + require.NoError(b, writer.WriteChannel(&Channel{ + ID: uint16(i), + SchemaID: 1, + Topic: "/chat", + MessageEncoding: "none", + })) + } + contentBuf := make([]byte, 32) + lastChunkMax := uint64(0) + thisChunkMax := uint64(0) + for i := uint64(0); i < messageCount; i++ { + channelID := uint16(i % uint64(channelCount)) + _, err := rand.Read(contentBuf) + require.NoError(b, err) + timestamp := i + if inputCfg.outOfOrderWithinChunks { + timestamp += (2 * (10 - (i % 10))) + if !inputCfg.chunksOverlap { + if timestamp < 
lastChunkMax { + timestamp = lastChunkMax + } + } + } + if timestamp > thisChunkMax { + thisChunkMax = timestamp + } + chunkCount := len(writer.ChunkIndexes) + require.NoError(b, writer.WriteMessage(&Message{ + ChannelID: channelID, + Sequence: uint32(i), + LogTime: timestamp, + PublishTime: timestamp, + Data: contentBuf, + })) + if len(writer.ChunkIndexes) != chunkCount { + lastChunkMax = thisChunkMax + } + } + require.NoError(b, writer.Close()) + b.StartTimer() + readerConfigs := []struct { + name string + order ReadOrder + useIndex bool + }{ + { + name: "no_index", + order: FileOrder, + useIndex: false, + }, + { + name: "file_order", + order: FileOrder, + useIndex: true, + }, + { + name: "time_order", + order: LogTimeOrder, + useIndex: true, + }, + { + name: "rev_order", + order: ReverseLogTimeOrder, + useIndex: true, + }, + } + for _, cfg := range readerConfigs { + b.Run(cfg.name, func(b *testing.B) { + for i := 0; i < b.N; i++ { + s := time.Now() + reader, err := NewReader(bytes.NewReader(buf.Bytes())) + require.NoError(b, err) + it, err := reader.Messages(UsingIndex(cfg.useIndex), InOrder(cfg.order)) + require.NoError(b, err) + readMessages := uint64(0) + msgBytes := uint64(0) + msg := Message{} + var lastErr error + orderErrors := 0 + var lastSeenTimestamp uint64 + for { + _, _, msg, err := it.NextInto(&msg) + if err != nil { + lastErr = err + break + } + if cfg.order == LogTimeOrder && msg.LogTime < lastSeenTimestamp { + orderErrors++ + } + if cfg.order == ReverseLogTimeOrder && msg.LogTime > lastSeenTimestamp && readMessages != 0 { + orderErrors++ + } + lastSeenTimestamp = msg.LogTime + readMessages++ + msgBytes += uint64(len(msg.Data)) + } + require.ErrorIs(b, lastErr, io.EOF) + require.Equal(b, messageCount, readMessages) + require.Equal(b, 0, orderErrors) + + b.ReportMetric(float64(messageCount)/time.Since(s).Seconds(), "msg/s") + b.ReportMetric(float64(msgBytes)/(time.Since(s).Seconds()*1024*1024), "MB/s") + } + }) + } + b.Run("bare_lexer", func(b *testing.B) { + for i := 0; i < b.N; i++ { + s := time.Now() + lexer, err := NewLexer(bytes.NewReader(buf.Bytes())) + require.NoError(b, err) + readMessages := uint64(0) + msgBytes := uint64(0) + var p []byte + var lastErr error + for { + token, record, err := lexer.Next(p) + if err != nil { + lastErr = err + break + } + if cap(record) > cap(p) { + p = record + } + if token == TokenMessage { + readMessages++ + msgBytes += uint64(len(record) - 22) + } + } + require.ErrorIs(b, lastErr, io.EOF) + require.Equal(b, messageCount, readMessages) + b.ReportMetric(float64(messageCount)/time.Since(s).Seconds(), "msg/s") + b.ReportMetric(float64(msgBytes)/(time.Since(s).Seconds()*1024*1024), "MB/s") + } + }) + }) + } +} diff --git a/go/mcap/slicemap.go b/go/mcap/slicemap.go new file mode 100644 index 0000000000..d8eb99b706 --- /dev/null +++ b/go/mcap/slicemap.go @@ -0,0 +1,44 @@ +package mcap + +import "math" + +// slicemap is an arraymap implementation with uint16 keys. This is useful for associating a set of +// Schema or Channel records with their IDs. +type slicemap[T any] struct { + items []*T +} + +func (s *slicemap[T]) Set(key uint16, val *T) { + if int(key) >= len(s.items) { + // extend the len() of s.items up to key + 1 + toAdd := int(key) + 1 - len(s.items) + // let append decide how much to expand the capacity of the slice + s.items = append(s.items, make([]*T, toAdd)...) 
+ } + s.items[key] = val +} + +func (s *slicemap[T]) Get(key uint16) *T { + if int(key) >= len(s.items) { + return nil + } + return s.items[key] +} + +func (s *slicemap[T]) Slice() []*T { + return s.items +} + +func (s *slicemap[T]) ToMap() map[uint16]*T { + out := make(map[uint16]*T) + for key, val := range s.items { + if key > math.MaxUint16 { + break + } + if val == nil { + continue + } + out[uint16(key)] = val + } + return out +} diff --git a/go/mcap/slicemap_test.go b/go/mcap/slicemap_test.go new file mode 100644 index 0000000000..e599e499f3 --- /dev/null +++ b/go/mcap/slicemap_test.go @@ -0,0 +1,33 @@ +package mcap + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestSlicemapLength(t *testing.T) { + var s slicemap[string] + val := "hello" + assert.Empty(t, s.Slice()) + + // setting the first value expands the slice enough to fit it + s.Set(0, &val) + assert.Equal(t, &val, s.Get(0)) + assert.Len(t, s.Slice(), 1) + + // setting another higher expands the slice enough to fit it + s.Set(5, &val) + assert.Equal(t, &val, s.Get(5)) + assert.Len(t, s.Slice(), 6) + + // setting a value <= len does not expand the slice + s.Set(1, &val) + assert.Equal(t, &val, s.Get(1)) + assert.Len(t, s.Slice(), 6) + + // getting a value > len does not expand the slice + var nilString *string + assert.Equal(t, nilString, s.Get(10)) + assert.Len(t, s.Slice(), 6) +} diff --git a/go/mcap/unindexed_message_iterator.go b/go/mcap/unindexed_message_iterator.go index bce77cc935..5796c8c359 100644 --- a/go/mcap/unindexed_message_iterator.go +++ b/go/mcap/unindexed_message_iterator.go @@ -6,56 +6,67 @@ import ( type unindexedMessageIterator struct { lexer *Lexer - schemas map[uint16]*Schema - channels map[uint16]*Channel + schemas slicemap[Schema] + channels slicemap[Channel] topics map[string]bool start uint64 end uint64 + recordBuf []byte + metadataCallback func(*Metadata) error } func (it *unindexedMessageIterator) Next(p []byte) (*Schema, *Channel, *Message, error) { + msg := &Message{Data: p} + return it.NextInto(msg) +} + +func (it *unindexedMessageIterator) NextInto(msg *Message) (*Schema, *Channel, *Message, error) { + if msg == nil { + msg = &Message{} + } for { - tokenType, record, err := it.lexer.Next(p) + tokenType, record, err := it.lexer.Next(it.recordBuf) if err != nil { return nil, nil, nil, err } + if cap(record) > cap(it.recordBuf) { + it.recordBuf = record + } switch tokenType { case TokenSchema: schema, err := ParseSchema(record) if err != nil { return nil, nil, nil, fmt.Errorf("failed to parse schema: %w", err) } - if _, ok := it.schemas[schema.ID]; !ok { - it.schemas[schema.ID] = schema - } + it.schemas.Set(schema.ID, schema) case TokenChannel: channelInfo, err := ParseChannel(record) if err != nil { return nil, nil, nil, fmt.Errorf("failed to parse channel info: %w", err) } - if _, ok := it.channels[channelInfo.ID]; !ok { - if len(it.topics) == 0 || it.topics[channelInfo.Topic] { - it.channels[channelInfo.ID] = channelInfo - } + if len(it.topics) == 0 || it.topics[channelInfo.Topic] { + it.channels.Set(channelInfo.ID, channelInfo) } case TokenMessage: - message, err := ParseMessage(record) - if err != nil { + if err := msg.PopulateFrom(record, true); err != nil { return nil, nil, nil, err } - if _, ok := it.channels[message.ChannelID]; !ok { + channel := it.channels.Get(msg.ChannelID) + if channel == nil { // skip messages on channels we don't know about. 
Note that if // an unindexed reader encounters a message it would be // interested in, but has not yet encountered the corresponding // channel ID, it has no option but to skip. continue } - if message.LogTime >= it.start && message.LogTime < it.end { - channel := it.channels[message.ChannelID] - schema := it.schemas[channel.SchemaID] - return schema, channel, message, nil + if msg.LogTime >= it.start && msg.LogTime < it.end { + schema := it.schemas.Get(channel.SchemaID) + if schema == nil && channel.SchemaID != 0 { + return nil, nil, nil, fmt.Errorf("channel %d with unrecognized schema ID %d", msg.ChannelID, channel.SchemaID) + } + return schema, channel, msg, nil } case TokenMetadata: if it.metadataCallback != nil { diff --git a/go/mcap/version.go b/go/mcap/version.go index ceb3bc1aea..8ca3c2ac15 100644 --- a/go/mcap/version.go +++ b/go/mcap/version.go @@ -1,4 +1,4 @@ package mcap // Version of the MCAP library. -var Version = "v1.3.1" +var Version = "v1.4.0" diff --git a/go/ros/ros2db3_to_mcap_test.go b/go/ros/ros2db3_to_mcap_test.go index 8dd02c3a06..603e1a156e 100644 --- a/go/ros/ros2db3_to_mcap_test.go +++ b/go/ros/ros2db3_to_mcap_test.go @@ -67,7 +67,7 @@ func TestDB3MCAPConversion(t *testing.T) { it, err := reader.Messages(mcap.WithTopics([]string{c.expectedTopic})) require.NoError(t, err) for { - schema, channel, message, err := it.Next(nil) + schema, channel, message, err := it.NextInto(nil) if err != nil { if errors.Is(err, io.EOF) { break From c5db32fae1c9e7c4349f758a97cc50f8cb6c4979 Mon Sep 17 00:00:00 2001 From: james-rms Date: Tue, 21 May 2024 09:31:06 +1000 Subject: [PATCH 06/44] cli: sort does not duplicate channels and schemas on every message (#1171) ### Changelog - fixed: `mcap sort` would rewrite the schema and channel for every message in the output file. This is fixed. ### Docs ### Description
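A rough, self-contained sketch of the de-duplication pattern the fix adopts (a hypothetical helper, not the actual CLI code; it assumes `github.com/foxglove/mcap/go/mcap` plus `errors` and `io` are imported, and is simplified relative to the real `sortFile` change below):

```go
// copyDeduped copies messages from it to writer, emitting each schema and
// channel record at most once (keyed by ID) rather than once per message.
func copyDeduped(it mcap.MessageIterator, writer *mcap.Writer) error {
	writtenSchemas := map[uint16]bool{}
	writtenChannels := map[uint16]bool{}
	message := mcap.Message{}
	for {
		schema, channel, _, err := it.NextInto(&message)
		if errors.Is(err, io.EOF) {
			return nil
		}
		if err != nil {
			return err
		}
		if schema != nil && !writtenSchemas[schema.ID] {
			if err := writer.WriteSchema(schema); err != nil {
				return err
			}
			// remember that this schema was written: the step the bug omitted
			writtenSchemas[schema.ID] = true
		}
		if !writtenChannels[channel.ID] {
			if err := writer.WriteChannel(channel); err != nil {
				return err
			}
			writtenChannels[channel.ID] = true
		}
		if err := writer.WriteMessage(&message); err != nil {
			return err
		}
	}
}
```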
--- go/cli/mcap/cmd/sort.go | 4 +++- go/cli/mcap/cmd/sort_test.go | 38 +++++++++++++++++++++++++++--------- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/go/cli/mcap/cmd/sort.go b/go/cli/mcap/cmd/sort.go index e14df3a2bc..e7dacfda2e 100644 --- a/go/cli/mcap/cmd/sort.go +++ b/go/cli/mcap/cmd/sort.go @@ -123,7 +123,7 @@ func sortFile(w io.Writer, r io.ReadSeeker) error { return fmt.Errorf("failed to read messages: %w", err) } schemas := make(map[uint16]*mcap.Schema) - channels := make(map[uint16]*mcap.Schema) + channels := make(map[uint16]*mcap.Channel) message := mcap.Message{} for { schema, channel, _, err := it.NextInto(&message) @@ -138,6 +138,7 @@ func sortFile(w io.Writer, r io.ReadSeeker) error { if err != nil { return fmt.Errorf("failed to write schema: %w", err) } + schemas[schema.ID] = schema } } if _, ok := channels[channel.ID]; !ok { @@ -145,6 +146,7 @@ func sortFile(w io.Writer, r io.ReadSeeker) error { if err != nil { return fmt.Errorf("failed to write channel: %w", err) } + channels[channel.ID] = channel } err = writer.WriteMessage(&message) if err != nil { diff --git a/go/cli/mcap/cmd/sort_test.go b/go/cli/mcap/cmd/sort_test.go index ee7d0e5c03..3530726771 100644 --- a/go/cli/mcap/cmd/sort_test.go +++ b/go/cli/mcap/cmd/sort_test.go @@ -2,6 +2,8 @@ package cmd import ( "bytes" + "errors" + "io" "testing" "github.com/foxglove/mcap/go/mcap" @@ -62,14 +64,32 @@ func TestSortFile(t *testing.T) { w := &bytes.Buffer{} require.NoError(t, sortFile(w, reader)) - // verify it is now sorted - r, err := mcap.NewReader(bytes.NewReader(w.Bytes())) + lexer, err := mcap.NewLexer(bytes.NewReader(w.Bytes())) require.NoError(t, err) - - it, err := r.Messages(mcap.UsingIndex(false)) - require.NoError(t, err) - - _, _, msg, err := it.NextInto(nil) - require.NoError(t, err) - assert.Equal(t, 25, int(msg.LogTime)) + var schemaCount, channelCount, messageCount int + var lastMessageTime uint64 +top: + for { + token, record, err := lexer.Next(nil) + if errors.Is(err, io.EOF) { + break + } + require.NoError(t, err) + switch token { + case mcap.TokenMessage: + messageCount++ + message, err := mcap.ParseMessage(record) + require.NoError(t, err) + require.GreaterOrEqual(t, message.LogTime, lastMessageTime) + lastMessageTime = message.LogTime + case mcap.TokenSchema: + schemaCount++ + case mcap.TokenChannel: + channelCount++ + case mcap.TokenDataEnd: + break top + } + } + assert.Equal(t, 1, schemaCount, "incorrect schema count") + assert.Equal(t, 2, channelCount, "incorrect channel count") } From ef1daa3685b63a0ec39e4af698111094af068a6f Mon Sep 17 00:00:00 2001 From: james-rms Date: Mon, 27 May 2024 14:10:50 +1000 Subject: [PATCH 07/44] doctor: warn on duplicate matching schemas and channels (#1172) ### Changelog - Added: `mcap doctor` now warns if a schema or channel record is duplicated in the data section. Some writers do this and it wastes space, though reading still works. - Added: `mcap doctor` now prints an error if an MCAP contains messages in indexed chunks where their channel and schema information are not duplicated in the summary section. This is required by the spec: https://mcap.dev/spec#chunk-index-op0x08 ### Docs Spec updated to clarify what channels and schemas must be repeated in the summary section. ### Description
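In rough terms, the new check works as sketched below: while scanning each chunk, the doctor records which channel IDs its messages reference, and once the summary section has been scanned it verifies that every referenced channel (and that channel's schema) was repeated there. This is a hypothetical, condensed illustration assuming `fmt` and the mcap package are imported; the real logic lives in `doctor.go` below and differs in details such as field names and message wording.

```go
// checkSummaryDuplication returns one problem string per channel or schema that
// is referenced by messages inside an indexed chunk but was not repeated in the
// summary section, as the MCAP spec requires.
func checkSummaryDuplication(
	referencedByChunk map[uint64][]uint16, // chunk start offset -> channel IDs used in that chunk
	channelsInData map[uint16]*mcap.Channel,
	channelsInSummary map[uint16]bool,
	schemasInSummary map[uint16]bool,
) []string {
	var problems []string
	for offset, ids := range referencedByChunk {
		for _, id := range ids {
			if !channelsInSummary[id] {
				problems = append(problems, fmt.Sprintf(
					"indexed chunk at offset %d references channel (%d) not duplicated in summary section", offset, id))
			}
			channel := channelsInData[id]
			if channel == nil || channel.SchemaID == 0 {
				continue // unknown channels are reported elsewhere; schema ID 0 means "no schema"
			}
			if !schemasInSummary[channel.SchemaID] {
				problems = append(problems, fmt.Sprintf(
					"indexed chunk at offset %d references schema (%d) not duplicated in summary section", offset, channel.SchemaID))
			}
		}
	}
	return problems
}
```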
--- go/cli/mcap/cmd/doctor.go | 277 ++++++++++++++++++++++----------- go/cli/mcap/cmd/doctor_test.go | 34 +++- website/docs/spec/index.md | 2 +- 3 files changed, 224 insertions(+), 89 deletions(-) diff --git a/go/cli/mcap/cmd/doctor.go b/go/cli/mcap/cmd/doctor.go index c9681b389a..4b21d4c912 100644 --- a/go/cli/mcap/cmd/doctor.go +++ b/go/cli/mcap/cmd/doctor.go @@ -9,6 +9,7 @@ import ( "io" "math" "os" + "reflect" "github.com/fatih/color" "github.com/foxglove/mcap/go/cli/mcap/utils" @@ -26,27 +27,33 @@ var ( type mcapDoctor struct { reader io.ReadSeeker - channels map[uint16]*mcap.Channel - schemas map[uint16]*mcap.Schema + channelsInDataSection map[uint16]*mcap.Channel + schemasInDataSection map[uint16]*mcap.Schema + channelsReferencedInChunksByOffset map[uint64][]uint16 + channelIDsInSummarySection map[uint16]bool + schemaIDsInSummarySection map[uint16]bool // Map from chunk offset to chunk index chunkIndexes map[uint64]*mcap.ChunkIndex + inSummarySection bool + messageCount uint64 minLogTime uint64 maxLogTime uint64 statistics *mcap.Statistics - errorCount uint32 + diagnosis Diagnosis } func (doctor *mcapDoctor) warn(format string, v ...any) { color.Yellow(format, v...) + doctor.diagnosis.Warnings = append(doctor.diagnosis.Warnings, fmt.Sprintf(format, v...)) } func (doctor *mcapDoctor) error(format string, v ...any) { color.Red(format, v...) - doctor.errorCount++ + doctor.diagnosis.Errors = append(doctor.diagnosis.Errors, fmt.Sprintf(format, v...)) } func (doctor *mcapDoctor) fatal(v ...any) { @@ -61,7 +68,101 @@ func (doctor *mcapDoctor) fatalf(format string, v ...any) { os.Exit(1) } -func (doctor *mcapDoctor) examineChunk(chunk *mcap.Chunk) { +func (doctor *mcapDoctor) examineSchema(schema *mcap.Schema) { + if schema.Encoding == "" { + if len(schema.Data) == 0 { + doctor.warn("Schema with ID: %d, Name: %q has empty Encoding and Data fields", schema.ID, schema.Name) + } else { + doctor.error("Schema with ID: %d has empty Encoding but Data contains: %q", schema.ID, string(schema.Data)) + } + } + + if schema.ID == 0 { + doctor.error("Schema.ID 0 is reserved. 
Do not make Schema records with ID 0.") + } + previous := doctor.schemasInDataSection[schema.ID] + if previous != nil { + if schema.Name != previous.Name { + doctor.error("Two schema records with same ID %d but different names (%q != %q)", + schema.ID, + schema.Name, + previous.Name, + ) + } + if schema.Encoding != previous.Encoding { + doctor.error("Two schema records with same ID %d but different encodings (%q != %q)", + schema.ID, + schema.Encoding, + previous.Encoding, + ) + } + if !bytes.Equal(schema.Data, previous.Data) { + doctor.error("Two schema records with different data present with same ID %d", schema.ID) + } + } + if doctor.inSummarySection { + if previous == nil { + doctor.error("Schema with id %d in summary section does not exist in data section", schema.ID) + } + doctor.schemaIDsInSummarySection[schema.ID] = true + } else { + if previous != nil { + doctor.warn("Duplicate schema records in data section with ID %d", schema.ID) + } + doctor.schemasInDataSection[schema.ID] = schema + } +} + +func (doctor *mcapDoctor) examineChannel(channel *mcap.Channel) { + previous := doctor.channelsInDataSection[channel.ID] + if previous != nil { + if channel.SchemaID != previous.SchemaID { + doctor.error("Two channel records with same ID %d but different schema IDs (%d != %d)", + channel.ID, + channel.SchemaID, + previous.SchemaID, + ) + } + if channel.Topic != previous.Topic { + doctor.error("Two channel records with same ID %d but different topics (%q != %q)", + channel.ID, + channel.Topic, + previous.Topic, + ) + } + if channel.MessageEncoding != previous.MessageEncoding { + doctor.error("Two channel records with same ID %d but different message encodings (%q != %q)", + channel.ID, + channel.MessageEncoding, + previous.MessageEncoding, + ) + } + if !reflect.DeepEqual(channel.Metadata, previous.Metadata) { + doctor.error("Two channel records with different metadata present with same ID %d", + channel.ID) + } + } + if doctor.inSummarySection { + if previous == nil { + doctor.error("Channel with ID %d in summary section does not exist in data section", channel.ID) + } + doctor.channelIDsInSummarySection[channel.ID] = true + } else { + if previous != nil { + doctor.warn("Duplicate channel records in data section with ID %d", channel.ID) + } + doctor.channelsInDataSection[channel.ID] = channel + } + + if channel.SchemaID != 0 { + if _, ok := doctor.schemasInDataSection[channel.SchemaID]; !ok { + doctor.error("Encountered Channel (%d) with unknown Schema (%d)", channel.ID, channel.SchemaID) + } + } +} + +func (doctor *mcapDoctor) examineChunk(chunk *mcap.Chunk, startOffset uint64) { + referencedChannels := make(map[uint16]bool) compressionFormat := mcap.CompressionFormat(chunk.Compression) var uncompressedBytes []byte @@ -90,7 +191,7 @@ func (doctor *mcapDoctor) examineChunk(chunk *mcap.Chunk) { return } default: - doctor.error("Unsupported compression format: %s", chunk.Compression) + doctor.error("Unsupported compression format: %q", chunk.Compression) return } @@ -115,7 +216,7 @@ func (doctor *mcapDoctor) examineChunk(chunk *mcap.Chunk) { EmitChunks: true, }) if err != nil { - doctor.error("Failed to make lexer for chunk bytes", err) + doctor.error("Failed to make lexer for chunk bytes: %s", err) return } defer lexer.Close() @@ -144,47 +245,29 @@ func (doctor *mcapDoctor) examineChunk(chunk *mcap.Chunk) { case mcap.TokenSchema: schema, err := mcap.ParseSchema(data) if err != nil { - doctor.error("Failed to parse schema:", err) + doctor.error("Failed to parse schema: %s", err) } - - if 
schema.Encoding == "" { - if len(schema.Data) == 0 { - doctor.warn("Schema with ID: %d, Name: %s has empty Encoding and Data fields", schema.ID, schema.Name) - } else { - doctor.error("Schema with ID: %d has empty Encoding but Data contains: %s", schema.ID, string(schema.Data)) - } - } - - if schema.ID == 0 { - doctor.error("Schema.ID 0 is reserved. Do not make Schema records with ID 0.") - } - - doctor.schemas[schema.ID] = schema + doctor.examineSchema(schema) case mcap.TokenChannel: channel, err := mcap.ParseChannel(data) if err != nil { doctor.error("Error parsing Channel: %s", err) } - - doctor.channels[channel.ID] = channel - if channel.SchemaID != 0 { - if _, ok := doctor.schemas[channel.SchemaID]; !ok { - doctor.error("Encountered Channel (%d) with unknown Schema (%d)", channel.ID, channel.SchemaID) - } - } + doctor.examineChannel(channel) case mcap.TokenMessage: message, err := mcap.ParseMessage(data) if err != nil { doctor.error("Error parsing Message: %s", err) } + referencedChannels[message.ChannelID] = true - channel := doctor.channels[message.ChannelID] + channel := doctor.channelsInDataSection[message.ChannelID] if channel == nil { - doctor.error("Got a Message record for channel: %d before a channel info.", message.ChannelID) + doctor.error("Got a Message record for channel: %d before a channel record.", message.ChannelID) } if message.LogTime < doctor.maxLogTime { - errStr := fmt.Sprintf("Message.log_time %d on %s is less than the latest log time %d", + errStr := fmt.Sprintf("Message.log_time %d on %q is less than the latest log time %d", message.LogTime, channel.Topic, doctor.maxLogTime) if strictMessageOrder { doctor.error(errStr) @@ -237,9 +320,19 @@ func (doctor *mcapDoctor) examineChunk(chunk *mcap.Chunk) { doctor.maxLogTime = maxLogTime } } + asArray := make([]uint16, 0, len(referencedChannels)) + for id := range referencedChannels { + asArray = append(asArray, id) + } + doctor.channelsReferencedInChunksByOffset[startOffset] = asArray +} + +type Diagnosis struct { + Errors []string + Warnings []string } -func (doctor *mcapDoctor) Examine() error { +func (doctor *mcapDoctor) Examine() Diagnosis { lexer, err := mcap.NewLexer(doctor.reader, &mcap.LexerOptions{ SkipMagic: false, ValidateChunkCRCs: true, @@ -286,61 +379,37 @@ func (doctor *mcapDoctor) Examine() error { } if header.Profile != "" && header.Profile != "ros1" && header.Profile != "ros2" { - doctor.warn(`Header.profile field "%s" is not a well-known profile.`, header.Profile) + doctor.warn(`Header.profile field %q is not a well-known profile.`, header.Profile) } case mcap.TokenFooter: footer, err = mcap.ParseFooter(data) if err != nil { - doctor.error("Failed to parse footer:", err) + doctor.error("Failed to parse footer: %s", err) } case mcap.TokenSchema: schema, err := mcap.ParseSchema(data) if err != nil { - doctor.error("Failed to parse schema:", err) + doctor.error("Failed to parse schema: %s", err) } - - if schema.Encoding == "" { - if len(schema.Data) == 0 { - doctor.warn("Schema with ID: %d, Name: %s has empty Encoding and Data fields", schema.ID, schema.Name) - } else { - doctor.error("Schema with ID: %d has empty Encoding but Data contains: %s", schema.ID, string(schema.Data)) - } - } - - if schema.ID == 0 { - doctor.error("Schema.ID 0 is reserved. 
Do not make Schema records with ID 0.") - } - - doctor.schemas[schema.ID] = schema + doctor.examineSchema(schema) case mcap.TokenChannel: channel, err := mcap.ParseChannel(data) if err != nil { doctor.error("Error parsing Channel: %s", err) } - - doctor.channels[channel.ID] = channel - - if channel.SchemaID != 0 { - if _, ok := doctor.schemas[channel.SchemaID]; !ok { - doctor.error( - "Encountered Channel (%d) with unknown Schema (%d)", - channel.ID, - channel.SchemaID, - ) - } - } + doctor.examineChannel(channel) case mcap.TokenMessage: message, err := mcap.ParseMessage(data) if err != nil { doctor.error("Error parsing Message: %s", err) } messageOutsideChunk = true - channel := doctor.channels[message.ChannelID] + channel := doctor.channelsInDataSection[message.ChannelID] if channel == nil { doctor.error("Got a Message record for channel: %d before a channel info.", message.ChannelID) } if message.LogTime < lastMessageTime { - doctor.error("Message.log_time %d on %s is less than the previous message record time %d", + doctor.error("Message.log_time %d on %q is less than the previous message record time %d", message.LogTime, channel.Topic, lastMessageTime) } lastMessageTime = message.LogTime @@ -359,11 +428,17 @@ func (doctor *mcapDoctor) Examine() error { if err != nil { doctor.error("Error parsing Message: %s", err) } - doctor.examineChunk(chunk) + pos, err := doctor.reader.Seek(0, io.SeekCurrent) + if err != nil { + // cannot continue if seek fails + doctor.fatalf("failed to determine read cursor: %s", err) + } + chunkStartOffset := uint64(pos - int64(len(data)) - 9) + doctor.examineChunk(chunk, chunkStartOffset) case mcap.TokenMessageIndex: _, err := mcap.ParseMessageIndex(data) if err != nil { - doctor.error("Failed to parse message index:", err) + doctor.error("Failed to parse message index: %s", err) } if messageOutsideChunk { doctor.warn("Message index in file has message records outside chunks. Indexed readers will miss these messages.") @@ -371,24 +446,24 @@ func (doctor *mcapDoctor) Examine() error { case mcap.TokenChunkIndex: chunkIndex, err := mcap.ParseChunkIndex(data) if err != nil { - doctor.error("Failed to parse chunk index:", err) + doctor.error("Failed to parse chunk index: %s", err) } if messageOutsideChunk { doctor.warn("Message index in file has message records outside chunks. 
Indexed readers will miss these messages.") } if _, ok := doctor.chunkIndexes[chunkIndex.ChunkStartOffset]; ok { - doctor.error("Multiple chunk indexes found for chunk at offset", chunkIndex.ChunkStartOffset) + doctor.error("Multiple chunk indexes found for chunk at offset %d", chunkIndex.ChunkStartOffset) } doctor.chunkIndexes[chunkIndex.ChunkStartOffset] = chunkIndex case mcap.TokenAttachmentIndex: _, err := mcap.ParseAttachmentIndex(data) if err != nil { - doctor.error("Failed to parse attachment index:", err) + doctor.error("Failed to parse attachment index: %s", err) } case mcap.TokenStatistics: statistics, err := mcap.ParseStatistics(data) if err != nil { - doctor.error("Failed to parse statistics:", err) + doctor.error("Failed to parse statistics: %s", err) } if doctor.statistics != nil { doctor.error("File contains multiple Statistics records") @@ -397,23 +472,24 @@ func (doctor *mcapDoctor) Examine() error { case mcap.TokenMetadata: _, err := mcap.ParseMetadata(data) if err != nil { - doctor.error("Failed to parse metadata:", err) + doctor.error("Failed to parse metadata: %s", err) } case mcap.TokenMetadataIndex: _, err := mcap.ParseMetadataIndex(data) if err != nil { - doctor.error("Failed to parse metadata index:", err) + doctor.error("Failed to parse metadata index: %s", err) } case mcap.TokenSummaryOffset: _, err := mcap.ParseSummaryOffset(data) if err != nil { - doctor.error("Failed to parse summary offset:", err) + doctor.error("Failed to parse summary offset: %s", err) } case mcap.TokenDataEnd: dataEnd, err = mcap.ParseDataEnd(data) if err != nil { - doctor.error("Failed to parse data end:", err) + doctor.error("Failed to parse data end: %s", err) } + doctor.inSummarySection = true case mcap.TokenError: // this is the value of the tokenType when there is an error // from the lexer, which we caught at the top. 
@@ -422,9 +498,32 @@ func (doctor *mcapDoctor) Examine() error { } for chunkOffset, chunkIndex := range doctor.chunkIndexes { + channelsReferenced := doctor.channelsReferencedInChunksByOffset[chunkOffset] + for _, id := range channelsReferenced { + if present := doctor.channelIDsInSummarySection[id]; !present { + doctor.error( + "Indexed chunk at offset %d contains messages referencing channel (%d) not duplicated in summary section", + chunkOffset, + id, + ) + } + channel := doctor.channelsInDataSection[id] + if channel == nil { + // message with unknown channel, this is checked when that message is scanned + continue + } + if present := doctor.schemaIDsInSummarySection[channel.SchemaID]; !present { + doctor.error( + "Indexed chunk at offset %d contains messages referencing schema (%d) not duplicated in summary section", + chunkOffset, + channel.SchemaID, + ) + } + } + _, err := doctor.reader.Seek(int64(chunkOffset), io.SeekStart) if err != nil { - die("failed to seek to chunk offset: %s", err) + doctor.fatalf("failed to seek to chunk offset: %s", err) } tokenType, data, err := lexer.Next(msg) if err != nil { @@ -475,7 +574,7 @@ func (doctor *mcapDoctor) Examine() error { } if chunk.Compression != chunkIndex.Compression.String() { doctor.error( - "Chunk at offset %d has compression %s, but its chunk index has compression %s", + "Chunk at offset %d has compression %q, but its chunk index has compression %q", chunkOffset, chunk.Compression, chunkIndex.Compression, @@ -483,7 +582,7 @@ func (doctor *mcapDoctor) Examine() error { } if uint64(len(chunk.Records)) != chunkIndex.CompressedSize { doctor.error( - "Chunk at offset %d has data length %d, but its chunk index has compressed size %s", + "Chunk at offset %d has data length %d, but its chunk index has compressed size %d", chunkOffset, len(chunk.Records), chunkIndex.CompressedSize, @@ -524,19 +623,19 @@ func (doctor *mcapDoctor) Examine() error { ) } } - if doctor.errorCount == 0 { - return nil - } - return fmt.Errorf("encountered %d errors", doctor.errorCount) + return doctor.diagnosis } func newMcapDoctor(reader io.ReadSeeker) *mcapDoctor { return &mcapDoctor{ - reader: reader, - channels: make(map[uint16]*mcap.Channel), - schemas: make(map[uint16]*mcap.Schema), - chunkIndexes: make(map[uint64]*mcap.ChunkIndex), - minLogTime: math.MaxUint64, + reader: reader, + channelsInDataSection: make(map[uint16]*mcap.Channel), + channelsReferencedInChunksByOffset: make(map[uint64][]uint16), + channelIDsInSummarySection: make(map[uint16]bool), + schemaIDsInSummarySection: make(map[uint16]bool), + schemasInDataSection: make(map[uint16]*mcap.Schema), + chunkIndexes: make(map[uint64]*mcap.ChunkIndex), + minLogTime: math.MaxUint64, } } @@ -555,7 +654,11 @@ func main(_ *cobra.Command, args []string) { if verbose { fmt.Printf("Examining %s\n", args[0]) } - return doctor.Examine() + diagnosis := doctor.Examine() + if len(diagnosis.Errors) > 0 { + return fmt.Errorf("encountered %d errors", len(diagnosis.Errors)) + } + return nil }) if err != nil { die("Doctor command failed: %s", err) diff --git a/go/cli/mcap/cmd/doctor_test.go b/go/cli/mcap/cmd/doctor_test.go index f191f6e1dc..3b6b47c629 100644 --- a/go/cli/mcap/cmd/doctor_test.go +++ b/go/cli/mcap/cmd/doctor_test.go @@ -2,9 +2,11 @@ package cmd import ( "bytes" + "os" "testing" "github.com/foxglove/mcap/go/mcap" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -29,6 +31,36 @@ func TestNoErrorOnMessagelessChunks(t *testing.T) { rs := bytes.NewReader(buf.Bytes()) doctor := 
newMcapDoctor(rs) - err = doctor.Examine() + diagnosis := doctor.Examine() + assert.Empty(t, diagnosis.Errors) +} + +func TestRequiresDuplicatedSchemasForIndexedMessages(t *testing.T) { + rs, err := os.Open("../../../../tests/conformance/data/OneMessage/OneMessage-ch-chx-pad.mcap") require.NoError(t, err) + defer func() { + require.NoError(t, rs.Close()) + }() + doctor := newMcapDoctor(rs) + diagnosis := doctor.Examine() + assert.Len(t, diagnosis.Errors, 2) + assert.Equal(t, + "Indexed chunk at offset 28 contains messages referencing channel (1) not duplicated in summary section", + diagnosis.Errors[0], + ) + assert.Equal(t, + "Indexed chunk at offset 28 contains messages referencing schema (1) not duplicated in summary section", + diagnosis.Errors[1], + ) +} + +func TestPassesIndexedMessagesWithRepeatedSchemas(t *testing.T) { + rs, err := os.Open("../../../../tests/conformance/data/OneMessage/OneMessage-ch-chx-pad-rch-rsh.mcap") + require.NoError(t, err) + defer func() { + require.NoError(t, rs.Close()) + }() + doctor := newMcapDoctor(rs) + diagnosis := doctor.Examine() + assert.Empty(t, diagnosis.Errors) } diff --git a/website/docs/spec/index.md b/website/docs/spec/index.md index 7d49891cc3..05b6794e83 100644 --- a/website/docs/spec/index.md +++ b/website/docs/spec/index.md @@ -225,7 +225,7 @@ A Chunk Index record exists for every Chunk in the file. | 8 | compressed_size | uint64 | The size of the chunk `records` field. | | 8 | uncompressed_size | uint64 | The uncompressed size of the chunk `records` field. This field should match the value in the corresponding Chunk record. | -A Schema and Channel record MUST exist in the summary section for all channels referenced by chunk index records. +A Schema and Channel record MUST exist in the summary section for all messages in chunks that are indexed by Chunk Index records. > Why? The typical use case for file readers using an index is fast random access to a specific message timestamp. Channel is a prerequisite for decoding Message record data. Without an easy-to-access copy of the Channel records, readers would need to search for Channel records from the start of the file, degrading random access read performance. From 5c1f9149dae8396635a236d12bd4e414c9f7200e Mon Sep 17 00:00:00 2001 From: kyle-basis Date: Wed, 29 May 2024 18:39:55 -0700 Subject: [PATCH 08/44] Fix capitalization error on protobuf wiki page (#1174) ### Changelog `Protobuf`->`protobuf` ### Docs None ### Description One of these versions compiles, the other does not. --- website/docs/guides/cpp/protobuf.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/guides/cpp/protobuf.md b/website/docs/guides/cpp/protobuf.md index 89affa05e6..01ee6760de 100644 --- a/website/docs/guides/cpp/protobuf.md +++ b/website/docs/guides/cpp/protobuf.md @@ -103,7 +103,7 @@ auto messageView = reader.readMessages(); #### Load schema definitions -We build a `DynamicMessageFactory`, using a `google::Protobuf::SimpleDescriptorDatabase` as the underlying descriptor database. By constructing this ourselves and retaining a reference to the database, we can more easily load that database with definitions from the MCAP file. +We build a `DynamicMessageFactory`, using a `google::protobuf::SimpleDescriptorDatabase` as the underlying descriptor database. By constructing this ourselves and retaining a reference to the database, we can more easily load that database with definitions from the MCAP file. 
```cpp gp::SimpleDescriptorDatabase protoDb; From 5b805219a9c8888d621ddf9bef2ffdafb4c5e1f3 Mon Sep 17 00:00:00 2001 From: kyle-basis Date: Thu, 30 May 2024 10:51:42 -0700 Subject: [PATCH 09/44] Stop example protobuf code from leaking (#1175) ### Changelog protobufs allocated with `New()` must be `delete`d later. ### Docs None ### Description
Before: Leaks. After: Doesn't leak (leaks less?)
--------- Co-authored-by: Jacob Bandes-Storch Co-authored-by: Hans-Joachim Krauch --- cpp/examples/protobuf/dynamic_reader.cpp | 3 ++- website/docs/guides/cpp/protobuf.md | 8 +++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/cpp/examples/protobuf/dynamic_reader.cpp b/cpp/examples/protobuf/dynamic_reader.cpp index 5c38547765..2476415d1a 100644 --- a/cpp/examples/protobuf/dynamic_reader.cpp +++ b/cpp/examples/protobuf/dynamic_reader.cpp @@ -8,6 +8,7 @@ #include #include "mcap/reader.hpp" +#include #include namespace gp = google::protobuf; @@ -82,7 +83,7 @@ int main(int argc, char** argv) { return 1; } } - gp::Message* message = protoFactory.GetPrototype(descriptor)->New(); + auto message = std::unique_ptr(protoFactory.GetPrototype(descriptor)->New()); if (!message->ParseFromArray(it->message.data, static_cast(it->message.dataSize))) { std::cerr << "failed to parse message using included foxglove.PointCloud schema" << std::endl; reader.close(); diff --git a/website/docs/guides/cpp/protobuf.md b/website/docs/guides/cpp/protobuf.md index 01ee6760de..4930db5da8 100644 --- a/website/docs/guides/cpp/protobuf.md +++ b/website/docs/guides/cpp/protobuf.md @@ -31,6 +31,12 @@ We also include the MCAP reader implementation: #include "mcap/reader.hpp" ``` +And standard library dependencies: + +```cpp +#include +``` + Use the `mcap::McapReader::open()` method to open an MCAP file for reading: ```cpp @@ -157,7 +163,7 @@ descriptor = protoPool.FindMessageTypeByName(it->schema->name); We can use this descriptor to parse our message: ```cpp -gp::Message* message = protoFactory.GetPrototype(descriptor)->New(); +auto message = std::unique_ptr(protoFactory.GetPrototype(descriptor)->New()); if (!message->ParseFromArray(static_cast(it->message.data), it->message.dataSize)) { std::cerr << "failed to parse message using included schema" << std::endl; From e03e3ebe82b7de42375981d46d4bce46bdeb303a Mon Sep 17 00:00:00 2001 From: Jacob Bandes-Storch Date: Fri, 31 May 2024 16:31:39 -0700 Subject: [PATCH 10/44] Clarify instructions for generating binary protobuf schemas using protoc (#1177) Clarifying that `--include_imports` is necessary. Adding this flag ensures all dependencies are also included in the binary schema. --- website/docs/spec/registry.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/spec/registry.md b/website/docs/spec/registry.md index eb50a9c5ec..722f143157 100644 --- a/website/docs/spec/registry.md +++ b/website/docs/spec/registry.md @@ -60,7 +60,7 @@ Schema `encoding` may only be omitted for self-describing message encodings such - `name`: Fully qualified name to the message within the descriptor set. For example, in a proto file containing `package foo.bar; message Baz {}` the fully qualified message name is `foo.bar.Baz`. - `encoding`: `protobuf` -- `data`: A binary [FileDescriptorSet](https://github.com/protocolbuffers/protobuf/blob/master/src/google/protobuf/descriptor.proto) as produced by `protoc --descriptor_set_out`. +- `data`: A binary [FileDescriptorSet](https://github.com/protocolbuffers/protobuf/blob/master/src/google/protobuf/descriptor.proto) as produced by `protoc --include_imports --descriptor_set_out`. 
### flatbuffer From f4d29c608013deea47de5fab0c4a2299f6815912 Mon Sep 17 00:00:00 2001 From: Jacob Bandes-Storch Date: Mon, 3 Jun 2024 12:02:16 -0700 Subject: [PATCH 11/44] Remove close() call from protobuf writer example (#1176) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Changelog None ### Docs None ### Description Calling close() after terminate() doesn't make sense for a couple reasons. First, close() itself calls terminate(). Second, close() attempts to write additional data to the file, and we are doing it here after a write already failed, so it might not be expected to succeed. On the other hand, it looks like the possible error codes from write() are NotOpen, InvalidChannelId, or InvalidSchemaId. In all of these cases it might actually be fine to call close(), as long as we didn't already call terminate()? On the third hand, ~McapWriter also calls close() so maybe we completely remove both calls? 🤷 --- cpp/examples/protobuf/writer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/examples/protobuf/writer.cpp b/cpp/examples/protobuf/writer.cpp index a3c28a2481..645814c54c 100644 --- a/cpp/examples/protobuf/writer.cpp +++ b/cpp/examples/protobuf/writer.cpp @@ -156,7 +156,6 @@ int main(int argc, char** argv) { if (!res.ok()) { std::cerr << "Failed to write message: " << res.message << "\n"; writer.terminate(); - writer.close(); std::ignore = std::remove(outputFilename); return 1; } From f229449bc84942540912db0267a086763fbaf714 Mon Sep 17 00:00:00 2001 From: james-rms Date: Fri, 7 Jun 2024 05:27:39 +1000 Subject: [PATCH 12/44] go: reader: read files with chunk indexes but not message indexes correctly. (#1179) ### Changelog - Fixed: if reading an MCAP file written with chunk index records but no message index records, the go MCAP reader would mistakenly yield no messages. - Fixed: the indexed MCAP reader would occasionally read too many bytes when reading a chunk. ### Docs None. ### Description #### Message indexes Right now the go indexed reader does not rely on message index records for anything except to check if a chunk contains messages for that topic. This is incorrect - if a chunk index does not contain message index offsets, it can mean either: - there are no messages in this chunk - message indexes are not available for this chunk See the spec: https://mcap.dev/spec#chunk-index-op0x08 #### chunk buffer sizing - When resizing the chunk buffer between chunk reads, the reader grows the chunk buffer by a multiple of the new chunk size if it's not big enough. However, it would not correctly set the `len` for that buffer, causing the next read to read more past the end of the chunk record into the chunk buffer. This does not cause the reader to yield incorrect information because we use use the contents of the chunk to determine how long the compressed data in it is. However, it causes more bytes to be read than neccessary. Also, it's possible to construct an MCAP where this would cause an "unexpected EOF" error. 
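To make the `len` versus `cap` point concrete, here is a small, self-contained sketch (not part of the patch; the byte values and sizes are made up) showing why the buffer's length, not just its capacity, must match the chunk size when the buffer is filled with `io.ReadFull`:

```go
package main

import (
	"bytes"
	"fmt"
	"io"
)

func main() {
	// Pretend the file contains one 4-byte chunk followed by other records.
	file := bytes.NewReader([]byte{1, 2, 3, 4, 0xEE, 0xEE, 0xEE, 0xEE})
	chunkLen := 4
	growthMultiple := 1.5

	// Fixed behaviour: len == chunkLen and the extra capacity is only headroom,
	// so io.ReadFull stops after exactly one chunk's worth of bytes.
	buf := make([]byte, chunkLen, int(float64(chunkLen)*growthMultiple))
	if _, err := io.ReadFull(file, buf); err != nil {
		panic(err)
	}
	fmt.Println(buf) // [1 2 3 4]

	// The buggy variant allocated make([]byte, int(float64(chunkLen)*growthMultiple)),
	// giving the slice length 6, so io.ReadFull would consume 6 bytes and read
	// past the end of the chunk record into whatever follows it.
}
```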
--- go/mcap/indexed_message_iterator.go | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/go/mcap/indexed_message_iterator.go b/go/mcap/indexed_message_iterator.go index d351326e13..5aae96f08d 100644 --- a/go/mcap/indexed_message_iterator.go +++ b/go/mcap/indexed_message_iterator.go @@ -147,13 +147,8 @@ func (it *indexedMessageIterator) parseSummarySection() error { return fmt.Errorf("failed to parse attachment index: %w", err) } // if the chunk overlaps with the requested parameters, load it - for _, channel := range it.channels.Slice() { - if channel != nil && idx.MessageIndexOffsets[channel.ID] > 0 { - if (it.end == 0 && it.start == 0) || (idx.MessageStartTime < it.end && idx.MessageEndTime >= it.start) { - it.chunkIndexes = append(it.chunkIndexes, idx) - } - break - } + if (it.end == 0 && it.start == 0) || (idx.MessageStartTime < it.end && idx.MessageEndTime >= it.start) { + it.chunkIndexes = append(it.chunkIndexes, idx) } case TokenStatistics: stats, err := ParseStatistics(record) @@ -200,8 +195,8 @@ func (it *indexedMessageIterator) loadChunk(chunkIndex *ChunkIndex) error { compressedChunkLength := chunkIndex.ChunkLength if uint64(cap(it.recordBuf)) < compressedChunkLength { - newSize := int(float64(compressedChunkLength) * chunkBufferGrowthMultiple) - it.recordBuf = make([]byte, newSize) + newCapacity := int(float64(compressedChunkLength) * chunkBufferGrowthMultiple) + it.recordBuf = make([]byte, compressedChunkLength, newCapacity) } else { it.recordBuf = it.recordBuf[:compressedChunkLength] } From 51980dd3673c5e96748feb1e010126c2a9c369f3 Mon Sep 17 00:00:00 2001 From: james-rms Date: Fri, 7 Jun 2024 05:44:30 +1000 Subject: [PATCH 13/44] go: bump version to v1.4.1 (#1181) increasing the version number so we can tag a new go version. --- go/mcap/version.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/mcap/version.go b/go/mcap/version.go index 8ca3c2ac15..16fa2d69e8 100644 --- a/go/mcap/version.go +++ b/go/mcap/version.go @@ -1,4 +1,4 @@ package mcap // Version of the MCAP library. -var Version = "v1.4.0" +var Version = "v1.4.1" From 179e67596e197e3da7bb66dd2b0adfb2027a1514 Mon Sep 17 00:00:00 2001 From: Maf <65976562+A-K-O-R-A@users.noreply.github.com> Date: Mon, 24 Jun 2024 20:25:24 +0200 Subject: [PATCH 14/44] rust: Make RawMessage fields public (#1184) ### Changelog RawMessage fields are now public ### Docs None ### Description Currently the `RawMessageStream` is useless because the API does not expose the fields of the `RawMessage` struct. To solve this issue the fields of `RawMessages` are now marked as `pub`. ### References This PR is the same as https://github.com/foxglove/mcap/pull/934 --- rust/src/read.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/src/read.rs b/rust/src/read.rs index 414c062b21..cd6808c089 100644 --- a/rust/src/read.rs +++ b/rust/src/read.rs @@ -624,8 +624,8 @@ impl<'a> RawMessageStream<'a> { } pub struct RawMessage<'a> { - header: records::MessageHeader, - data: Cow<'a, [u8]>, + pub header: records::MessageHeader, + pub data: Cow<'a, [u8]>, } impl<'a> Iterator for RawMessageStream<'a> { From 680fd0deae439abf67680c81961eae15c1d7d93d Mon Sep 17 00:00:00 2001 From: Jacob Bandes-Storch Date: Mon, 24 Jun 2024 12:35:20 -0700 Subject: [PATCH 15/44] validate: check both message encoding and schema encoding (#1186) The validate example was incorrect for ros2msg/cdr channels, and it was also only checking the message encoding and not the schema encoding. 
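For reference, a small standalone sketch (not code from this repo) of the schema-encoding / message-encoding pairs the validator accepts after this change:

```typescript
// Standalone sketch (not from validate.ts): a deserializer is only chosen when
// the schema encoding and message encoding form a known pair.
const supportedPairs: [schemaEncoding: string, messageEncoding: string][] = [
  ["ros1msg", "ros1"],
  ["ros2msg", "cdr"],
  ["protobuf", "protobuf"],
];

function isSupported(schemaEncoding: string, messageEncoding: string): boolean {
  return supportedPairs.some(([s, m]) => s === schemaEncoding && m === messageEncoding);
}

console.log(isSupported("ros2msg", "cdr")); // true: the ROS 2 pairing this change fixes
console.log(isSupported("ros2msg", "ros2")); // false: "ros2" is not a message encoding handled here
```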
--- typescript/examples/validate/scripts/validate.ts | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/typescript/examples/validate/scripts/validate.ts b/typescript/examples/validate/scripts/validate.ts index 150186f794..457bec0fcc 100644 --- a/typescript/examples/validate/scripts/validate.ts +++ b/typescript/examples/validate/scripts/validate.ts @@ -139,7 +139,7 @@ async function validate( throw new Error(`Missing schema ${record.schemaId} for channel ${record.id}`); } let messageDeserializer: (data: ArrayBufferView) => unknown; - if (record.messageEncoding === "ros1") { + if (schema.encoding === "ros1msg" && record.messageEncoding === "ros1") { const reader = new ROS1LazyMessageReader( parseMessageDefinition(new TextDecoder().decode(schema.data)), ); @@ -150,14 +150,14 @@ async function validate( } return reader.readMessage(data).toJSON(); }; - } else if (record.messageEncoding === "ros2") { + } else if (schema.encoding === "ros2msg" && record.messageEncoding === "cdr") { const reader = new ROS2MessageReader( parseMessageDefinition(new TextDecoder().decode(schema.data), { ros2: true, }), ); messageDeserializer = (data) => reader.readMessage(data); - } else if (record.messageEncoding === "protobuf") { + } else if (schema.encoding === "protobuf" && record.messageEncoding === "protobuf") { const root = protobufjs.Root.fromDescriptor(FileDescriptorSet.decode(schema.data)); const type = root.lookupType(schema.name); @@ -167,7 +167,9 @@ async function validate( const textDecoder = new TextDecoder(); messageDeserializer = (data) => JSON.parse(textDecoder.decode(data)); } else { - throw new Error(`unsupported encoding ${record.messageEncoding}`); + throw new Error( + `unsupported message encoding ${record.messageEncoding} with schema encoding ${schema.encoding}`, + ); } channelInfoById.set(record.id, { info: record, messageDeserializer }); break; From b9c9778886b4f35353034349a52382a5db76ce22 Mon Sep 17 00:00:00 2001 From: Jacob Bandes-Storch Date: Mon, 24 Jun 2024 14:28:23 -0700 Subject: [PATCH 16/44] Bump rust to 0.9.1 (#1187) Releasing changes from #1184 --- rust/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/Cargo.toml b/rust/Cargo.toml index d194adf369..6ce4a85fef 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -7,7 +7,7 @@ categories = [ "science::robotics", "compression" ] repository = "https://github.com/foxglove/mcap" documentation = "https://docs.rs/mcap" readme = "README.md" -version = "0.9.0" +version = "0.9.1" edition = "2021" license = "MIT" From 037ceb7be9ba756afb32fcf43cbc711327fa5e92 Mon Sep 17 00:00:00 2001 From: james-rms Date: Tue, 25 Jun 2024 09:07:21 +1000 Subject: [PATCH 17/44] Typescript: make parsing faster (#1185) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Changelog - Typescript: improved record parsing time. ### Docs None. ### Description I was trying to port some of the improvements from https://github.com/foxglove/mcap/pull/1168 to typescript, and noticed that this change to `parse.ts` improves reading speed significantly with no additional API changes. This change switches from `new Uint8Array(fromBuffer.slice(start, end)` to `new Uint8Array(fromBuffer, start, length).slice()`. These both have the same effect of producing a new Uint8Array with a copy of some part of `fromBuffer`. I also rewrote the typescript benchmark utility to remove the dependency on `benny`, so that I could gather memory statistics as part of the benchmark. 
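To make the comparison concrete, here is a small standalone sketch (not taken from `parse.ts`) showing that both forms copy the same bytes; only how the copy is made differs. The old form copies the underlying `ArrayBuffer` region and then wraps the copy, while the new form creates a zero-copy view and copies just that view with `TypedArray.prototype.slice()`.

```typescript
// Fill an 8-byte buffer with the values 0..7.
const buffer = new ArrayBuffer(8);
new Uint8Array(buffer).set([0, 1, 2, 3, 4, 5, 6, 7]);

const start = 2;
const length = 4;

// Old form: copy the ArrayBuffer region first, then wrap the copy.
const a = new Uint8Array(buffer.slice(start, start + length));

// New form: create a zero-copy view over the region, then copy just that view.
const b = new Uint8Array(buffer, start, length).slice();

console.log(a); // Uint8Array(4) [ 2, 3, 4, 5 ]
console.log(b); // Uint8Array(4) [ 2, 3, 4, 5 ]
```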
#### Before: ``` (python-sNIFi2pF) j@192-168-1-105 benchmarks % yarn bench --suite reader Running 'reader' suite McapStreamReader 1.29±0.02 op/s Heap Used: 259.38±4.95 MB/op Heap Total: 288.73±3.47 MB/op ArrayBuffers: 214.31±1.60 MB/op McapIndexedReader 0.95±0.01 op/s Heap Used: 248.20±21.02 MB/op Heap Total: 281.67±20.46 MB/op ArrayBuffers: 98.33±5.42 MB/op McapIndexedReader_reverse 1.00±0.02 op/s Heap Used: 260.77±17.31 MB/op Heap Total: 295.44±17.31 MB/op ArrayBuffers: 102.49±4.62 MB/op ``` #### After ``` (python-sNIFi2pF) j@192-168-1-105 benchmarks % yarn bench --suite reader Running 'reader' suite McapStreamReader 3.04±0.02 op/s Heap Used: 261.58±8.20 MB/op Heap Total: 289.82±6.71 MB/op ArrayBuffers: 214.18±1.60 MB/op McapIndexedReader 1.83±0.01 op/s Heap Used: 281.22±4.06 MB/op Heap Total: 317.18±2.63 MB/op ArrayBuffers: 107.06±0.00 MB/op McapIndexedReader_reverse 1.86±0.00 op/s Heap Used: 278.37±2.01 MB/op Heap Total: 313.65±0.93 MB/op ArrayBuffers: 107.06±0.00 MB/op ``` --- typescript/benchmarks/bench.ts | 107 +++++++++++++++ typescript/benchmarks/index.ts | 155 ++++++++++++++++------ typescript/benchmarks/package.json | 6 +- typescript/core/package.json | 2 +- typescript/core/src/parse.ts | 43 +++--- yarn.lock | 203 ++--------------------------- 6 files changed, 257 insertions(+), 259 deletions(-) create mode 100644 typescript/benchmarks/bench.ts diff --git a/typescript/benchmarks/bench.ts b/typescript/benchmarks/bench.ts new file mode 100644 index 0000000000..d0baead94f --- /dev/null +++ b/typescript/benchmarks/bench.ts @@ -0,0 +1,107 @@ +import { hrtime, memoryUsage } from "process"; + +const COUNT = 5; + +type BenchmarkResult = + | { + name: string; + gcExposed: true; + samples: { + duration: bigint; + memoryUsage: { + rss: number; + heapTotal: number; + heapUsed: number; + external: number; + arrayBuffers: number; + }; + }[]; + } + | { + name: string; + gcExposed: false; + samples: { + duration: bigint; + }[]; + }; + +/** runs a benchmark and logs statistics about runtime and memory usage afterward. + * + * @param name A name for the benchmark. + * @param run a routine that runs the benchmark code. 
+ */ +export async function runBenchmark(name: string, run: () => Promise): Promise { + let result: BenchmarkResult; + if (global.gc != undefined) { + result = { + name, + gcExposed: true, + samples: [], + }; + for (let i = 0; i < COUNT; i++) { + global.gc(); + const before = hrtime.bigint(); + await run(); + const after = hrtime.bigint(); + result.samples.push({ + duration: after - before, + memoryUsage: memoryUsage(), + }); + } + } else { + result = { + name, + gcExposed: false, + samples: [], + }; + for (let i = 0; i < COUNT; i++) { + const before = hrtime.bigint(); + await run(); + const after = hrtime.bigint(); + result.samples.push({ duration: after - before }); + } + } + printStats(result); +} + +function humanReadableStatistics(values: number[], unit: string): string { + const count = values.length; + if (count < 1) { + return "(No samples)"; + } + if (count < 2) { + return `${values[0]} ${unit}`; + } + const mean = values.reduce((a, b) => a + b, 0) / count; + const stdDev = Math.sqrt( + values.map((value) => (mean - value) ** 2).reduce((a, b) => a + b, 0) / (count - 1), + ); + const stdErr = stdDev / Math.sqrt(count); + return `${mean.toFixed(2)}±${stdErr.toFixed(2)} ${unit}`; +} + +function printStats(result: BenchmarkResult) { + let memoryResult = "(use --expose-gc to gather memory statistics)"; + if (result.gcExposed) { + const used = humanReadableStatistics( + result.samples.map((sample) => sample.memoryUsage.heapUsed / 2 ** 20), + "MB/op", + ); + const total = humanReadableStatistics( + result.samples.map((sample) => sample.memoryUsage.heapTotal / 2 ** 20), + "MB/op", + ); + const arrayBuffers = humanReadableStatistics( + result.samples.map((sample) => sample.memoryUsage.arrayBuffers / 2 ** 20), + "MB/op", + ); + memoryResult = `Heap Used: ${used}\tHeap Total: ${total}\tArrayBuffers: ${arrayBuffers}`; + } + const name = result.name; + const timeStat = humanReadableStatistics( + result.samples.map((r) => 1 / (Number(r.duration) / 1e9)), + "op/s", + ); + console.log(name); + console.log(`\t${timeStat}\t${memoryResult}`); +} diff --git a/typescript/benchmarks/index.ts b/typescript/benchmarks/index.ts index 19f8069406..818f8f6073 100644 --- a/typescript/benchmarks/index.ts +++ b/typescript/benchmarks/index.ts @@ -1,5 +1,8 @@ -import { McapWriter } from "@mcap/core"; -import { add, complete, cycle, suite } from "benny"; +import { McapIndexedReader, McapStreamReader, McapWriter, TempBuffer } from "@mcap/core"; +import assert from "assert"; +import { program } from "commander"; + +import { runBenchmark } from "./bench"; /** * An IWritable that copies data to memory, but overwrites previous data. 
This allows benchmarking @@ -30,7 +33,78 @@ class FakeMemoryWritable { } } -function addWriteBenchmark({ +async function benchmarkReaders() { + const messageSize = 10; + const chunkSize = 1024 * 1024 * 4; + const numMessages = 1_000_000; + const messageData = new Uint8Array(messageSize).fill(42); + const buf = new TempBuffer(); + const writer = new McapWriter({ writable: buf, chunkSize }); + await writer.start({ library: "", profile: "" }); + const channelId = await writer.registerChannel({ + schemaId: 0, + topic: "", + messageEncoding: "", + metadata: new Map([]), + }); + for (let i = 0; i < numMessages; i++) { + await writer.addMessage({ + channelId, + sequence: i, + logTime: BigInt(i), + publishTime: BigInt(i), + data: messageData, + }); + } + await writer.end(); + await runBenchmark(McapStreamReader.name, async () => { + const reader = new McapStreamReader(); + reader.append(buf.get()); + let messageCount = 0; + for (;;) { + const rec = reader.nextRecord(); + if (rec != undefined) { + if (rec.type === "Message") { + messageCount++; + } + } else { + break; + } + } + assert(messageCount === numMessages, `expected ${numMessages} messages, got ${messageCount}`); + }); + await runBenchmark(McapIndexedReader.name, async () => { + const reader = await McapIndexedReader.Initialize({ readable: buf }); + let messageCount = 0; + for await (const _ of reader.readMessages()) { + messageCount++; + } + assert(messageCount === numMessages, `expected ${numMessages} messages, got ${messageCount}`); + }); + await runBenchmark(McapIndexedReader.name + "_reverse", async () => { + const reader = await McapIndexedReader.Initialize({ readable: buf }); + let messageCount = 0; + for await (const _ of reader.readMessages({ reverse: true })) { + messageCount++; + } + assert(messageCount === numMessages, `expected ${numMessages} messages, got ${messageCount}`); + }); +} + +export async function benchmarkWriter(): Promise { + await runWriteBenchmark({ numMessages: 1_000_000, messageSize: 1, chunkSize: 1024 * 1024 }); + await runWriteBenchmark({ numMessages: 100_000, messageSize: 1000, chunkSize: 1024 * 1024 }); + await runWriteBenchmark({ numMessages: 100, messageSize: 1_000_000, chunkSize: 1024 * 1024 }); + await runWriteBenchmark({ numMessages: 1_000_000, messageSize: 1, chunkSize: 10 * 1024 * 1024 }); + await runWriteBenchmark({ numMessages: 100_000, messageSize: 1000, chunkSize: 10 * 1024 * 1024 }); + await runWriteBenchmark({ + numMessages: 100, + messageSize: 1_000_000, + chunkSize: 10 * 1024 * 1024, + }); +} + +async function runWriteBenchmark({ numMessages, messageSize, chunkSize, @@ -39,54 +113,49 @@ function addWriteBenchmark({ messageSize: number; chunkSize: number; }) { - return add( + const messageData = new Uint8Array(messageSize).fill(42); + const writable = new FakeMemoryWritable(2 * chunkSize); + await runBenchmark( `count=${numMessages.toLocaleString()} size=${messageSize.toLocaleString()} chunkSize=${chunkSize.toLocaleString()} (1 op ≈ ${( numMessages * messageSize ).toLocaleString()} bytes)`, async () => { - const messageData = new Uint8Array(messageSize).fill(42); - const writable = new FakeMemoryWritable(2 * chunkSize); - return async () => { - writable.reset(); - const writer = new McapWriter({ writable, chunkSize }); - await writer.start({ library: "", profile: "" }); - const channelId = await writer.registerChannel({ - schemaId: 0, - topic: "", - messageEncoding: "", - metadata: new Map([]), + writable.reset(); + const writer = new McapWriter({ writable, chunkSize }); + await writer.start({ 
library: "", profile: "" }); + const channelId = await writer.registerChannel({ + schemaId: 0, + topic: "", + messageEncoding: "", + metadata: new Map([]), + }); + for (let i = 0; i < numMessages; i++) { + await writer.addMessage({ + channelId, + sequence: i, + logTime: BigInt(i), + publishTime: BigInt(i), + data: messageData, }); - for (let i = 0; i < numMessages; i++) { - await writer.addMessage({ - channelId, - sequence: i, - logTime: BigInt(i), - publishTime: BigInt(i), - data: messageData, - }); - } - await writer.end(); - }; + } + await writer.end(); }, ); } -async function benchmarkWriter() { - await suite( - McapWriter.name, - addWriteBenchmark({ numMessages: 1_000_000, messageSize: 1, chunkSize: 1024 * 1024 }), - addWriteBenchmark({ numMessages: 100_000, messageSize: 1000, chunkSize: 1024 * 1024 }), - addWriteBenchmark({ numMessages: 100, messageSize: 1_000_000, chunkSize: 1024 * 1024 }), - addWriteBenchmark({ numMessages: 1_000_000, messageSize: 1, chunkSize: 10 * 1024 * 1024 }), - addWriteBenchmark({ numMessages: 100_000, messageSize: 1000, chunkSize: 10 * 1024 * 1024 }), - addWriteBenchmark({ numMessages: 100, messageSize: 1_000_000, chunkSize: 10 * 1024 * 1024 }), - cycle(), - complete(), - ); -} - -async function main() { - await benchmarkWriter(); +async function main(args: { suite?: string }) { + const { suite } = args; + if (suite == undefined || suite === "writer") { + console.log("Running 'writer' suite"); + await benchmarkWriter(); + } + if (suite == undefined || suite === "reader") { + console.log("Running 'reader' suite"); + await benchmarkReaders(); + } } -void main(); +program + .addOption(program.createOption("--suite ", "Name of suite to run")) + .action(main) + .parse(); diff --git a/typescript/benchmarks/package.json b/typescript/benchmarks/package.json index 925a49dce3..9121b5f574 100644 --- a/typescript/benchmarks/package.json +++ b/typescript/benchmarks/package.json @@ -17,8 +17,8 @@ "typecheck": "tsc -p tsconfig.json --noEmit", "lint:ci": "eslint --report-unused-disable-directives .", "lint": "eslint --report-unused-disable-directives --fix .", - "bench": "ts-node --files --project tsconfig.cjs.json index.ts", - "bench:debug": "NODE_OPTIONS='--inspect-brk' ts-node --files --project tsconfig.cjs.json index.ts" + "bench": "TS_NODE_FILES=true TS_NODE_PROJECT=tsconfig.cjs.json node --expose-gc -r 'ts-node/register' index.ts", + "bench:debug": "TS_NODE_FILES=true TS_NODE_PROJECT=tsconfig.cjs.json node --inspect-brk --expose-gc -r 'ts-node/register' index.ts" }, "devDependencies": { "@foxglove/eslint-plugin": "1.0.1", @@ -27,7 +27,7 @@ "@types/node": "18.13.0", "@typescript-eslint/eslint-plugin": "6.11.0", "@typescript-eslint/parser": "6.11.0", - "benny": "^3.7.1", + "commander": "12.1.0", "eslint": "8.54.0", "eslint-config-prettier": "9.0.0", "eslint-plugin-es": "4.1.0", diff --git a/typescript/core/package.json b/typescript/core/package.json index cf4b2b05e0..2153651369 100644 --- a/typescript/core/package.json +++ b/typescript/core/package.json @@ -1,6 +1,6 @@ { "name": "@mcap/core", - "version": "2.1.1", + "version": "2.1.2", "description": "MCAP file support in TypeScript", "license": "MIT", "repository": { diff --git a/typescript/core/src/parse.ts b/typescript/core/src/parse.ts index 1d9672fea8..7f2fe80285 100644 --- a/typescript/core/src/parse.ts +++ b/typescript/core/src/parse.ts @@ -62,10 +62,15 @@ export function parseRecord({ } if (!isKnownOpcode(opcode)) { + const data = new Uint8Array( + view.buffer, + view.byteOffset + headerReader.offset, + 
recordLengthNum, + ); const record: TypedMcapRecord = { type: "Unknown", opcode, - data: new Uint8Array(view.buffer, view.byteOffset + headerReader.offset, recordLengthNum), + data, }; return { record, usedBytes: recordEndOffset - startOffset }; } @@ -107,11 +112,10 @@ export function parseRecord({ throw new Error(`Schema data length ${dataLen} exceeds bounds of record`); } const data = new Uint8Array( - recordView.buffer.slice( - recordView.byteOffset + reader.offset, - recordView.byteOffset + reader.offset + dataLen, - ), - ); + recordView.buffer, + recordView.byteOffset + reader.offset, + dataLen, + ).slice(); reader.offset += dataLen; const record: TypedMcapRecord = { @@ -153,11 +157,10 @@ export function parseRecord({ const logTime = reader.uint64(); const publishTime = reader.uint64(); const data = new Uint8Array( - recordView.buffer.slice( - recordView.byteOffset + reader.offset, - recordView.byteOffset + recordView.byteLength, - ), - ); + recordView.buffer, + recordView.byteOffset + reader.offset, + recordView.byteLength - reader.offset, + ).slice(); const record: TypedMcapRecord = { type: "Message", channelId, @@ -180,11 +183,10 @@ export function parseRecord({ throw new Error("Chunk records length exceeds remaining record size"); } const records = new Uint8Array( - recordView.buffer.slice( - recordView.byteOffset + reader.offset, - recordView.byteOffset + reader.offset + recordByteLength, - ), - ); + recordView.buffer, + recordView.byteOffset + reader.offset, + recordByteLength, + ).slice(); const record: TypedMcapRecord = { type: "Chunk", messageStartTime: startTime, @@ -250,11 +252,10 @@ export function parseRecord({ throw new Error(`Attachment data length ${dataLen} exceeds bounds of record`); } const data = new Uint8Array( - recordView.buffer.slice( - recordView.byteOffset + reader.offset, - recordView.byteOffset + reader.offset + Number(dataLen), - ), - ); + recordView.buffer, + recordView.byteOffset + reader.offset, + Number(dataLen), + ).slice(); reader.offset += Number(dataLen); const crcLength = reader.offset; const expectedCrc = reader.uint32(); diff --git a/yarn.lock b/yarn.lock index 8d6b97d997..90c5ddef7e 100644 --- a/yarn.lock +++ b/yarn.lock @@ -189,50 +189,6 @@ __metadata: languageName: node linkType: hard -"@arrows/array@npm:^1.4.1": - version: 1.4.1 - resolution: "@arrows/array@npm:1.4.1" - dependencies: - "@arrows/composition": ^1.2.2 - checksum: 39de47a49709376d91360955665f5cc33ad6fce85125a5b1fde777bf963bd2d053cc77a587253a55e6f4241a75ad7db991aacc26eb36edb7a746d824eb8ebd8a - languageName: node - linkType: hard - -"@arrows/composition@npm:^1.0.0, @arrows/composition@npm:^1.2.2": - version: 1.2.2 - resolution: "@arrows/composition@npm:1.2.2" - checksum: 3219e9a4e220c9778d8919fef329608b9966667b61f26e403d368646ebc65d96b68abcb7a73621992baad678e444ceb36914f1f2db2d6502ddfe738e9230e737 - languageName: node - linkType: hard - -"@arrows/dispatch@npm:^1.0.2": - version: 1.0.3 - resolution: "@arrows/dispatch@npm:1.0.3" - dependencies: - "@arrows/composition": ^1.2.2 - checksum: 2bd0b1ad5345b056cd300b63eedf3a1b9f17e8f891a5b5d1e70e9a3d8c426ec05828c38cd437f742e75387fbc98b3082fef23f62fe97688b63d060376d50dcd9 - languageName: node - linkType: hard - -"@arrows/error@npm:^1.0.2": - version: 1.0.2 - resolution: "@arrows/error@npm:1.0.2" - checksum: 35ad67e8d2781879a22711f5c7ba3907d6772ff42b24abc8b94b5165414e802f6c207f2024f50508c8f40637465a91da268ebf321c0eef5aaf44fc3d4acc7a58 - languageName: node - linkType: hard - -"@arrows/multimethod@npm:^1.1.6": - version: 1.4.1 - resolution: 
"@arrows/multimethod@npm:1.4.1" - dependencies: - "@arrows/array": ^1.4.1 - "@arrows/composition": ^1.2.2 - "@arrows/error": ^1.0.2 - fast-deep-equal: ^3.1.3 - checksum: 2a3a6b62debb163448ce1e90c9a0508866e605895967a67ef3c65f5248e5e7318ae95a92d4a62aff0518eea63755cc0467deb3265c3c9b41e00a892802ae729a - languageName: node - linkType: hard - "@babel/code-frame@npm:^7.0.0, @babel/code-frame@npm:^7.10.4, @babel/code-frame@npm:^7.12.13, @babel/code-frame@npm:^7.16.0, @babel/code-frame@npm:^7.18.6, @babel/code-frame@npm:^7.21.4, @babel/code-frame@npm:^7.8.3": version: 7.21.4 resolution: "@babel/code-frame@npm:7.21.4" @@ -2832,7 +2788,7 @@ __metadata: "@types/node": 18.13.0 "@typescript-eslint/eslint-plugin": 6.11.0 "@typescript-eslint/parser": 6.11.0 - benny: ^3.7.1 + commander: 12.1.0 eslint: 8.54.0 eslint-config-prettier: 9.0.0 eslint-plugin-es: 4.1.0 @@ -5123,7 +5079,7 @@ __metadata: languageName: node linkType: hard -"ansi-escapes@npm:^4.2.1, ansi-escapes@npm:^4.3.0": +"ansi-escapes@npm:^4.2.1": version: 4.3.2 resolution: "ansi-escapes@npm:4.3.2" dependencies: @@ -5374,13 +5330,6 @@ __metadata: languageName: node linkType: hard -"astral-regex@npm:^2.0.0": - version: 2.0.0 - resolution: "astral-regex@npm:2.0.0" - checksum: 876231688c66400473ba505731df37ea436e574dd524520294cc3bbc54ea40334865e01fa0d074d74d036ee874ee7e62f486ea38bc421ee8e6a871c06f011766 - languageName: node - linkType: hard - "asynciterator.prototype@npm:^1.0.0": version: 1.0.0 resolution: "asynciterator.prototype@npm:1.0.0" @@ -5623,33 +5572,6 @@ __metadata: languageName: node linkType: hard -"benchmark@npm:^2.1.4": - version: 2.1.4 - resolution: "benchmark@npm:2.1.4" - dependencies: - lodash: ^4.17.4 - platform: ^1.3.3 - checksum: aa466561d4f2b0a2419a3069b8f90fd35ffacf26849697eea9de525ecfbd10b44da11070cc51c88d772076db8cb2415641b493de7d6c024fdf8551019c6fcf1c - languageName: node - linkType: hard - -"benny@npm:^3.7.1": - version: 3.7.1 - resolution: "benny@npm:3.7.1" - dependencies: - "@arrows/composition": ^1.0.0 - "@arrows/dispatch": ^1.0.2 - "@arrows/multimethod": ^1.1.6 - benchmark: ^2.1.4 - common-tags: ^1.8.0 - fs-extra: ^10.0.0 - json2csv: ^5.0.6 - kleur: ^4.1.4 - log-update: ^4.0.0 - checksum: 8dcca91afb6e97b986a16fc73a2a12b2d51c306dc1e9fca6ace988b3ca26405dffcb85309083a449d27cfab440d8164b5cff3a0deba034879da401305412af34 - languageName: node - linkType: hard - "big-integer@npm:^1.6.44": version: 1.6.51 resolution: "big-integer@npm:1.6.51" @@ -6162,15 +6084,6 @@ __metadata: languageName: node linkType: hard -"cli-cursor@npm:^3.1.0": - version: 3.1.0 - resolution: "cli-cursor@npm:3.1.0" - dependencies: - restore-cursor: ^3.1.0 - checksum: 2692784c6cd2fd85cfdbd11f53aea73a463a6d64a77c3e098b2b4697a20443f430c220629e1ca3b195ea5ac4a97a74c2ee411f3807abf6df2b66211fec0c0a29 - languageName: node - linkType: hard - "cli-table3@npm:^0.6.2": version: 0.6.3 resolution: "cli-table3@npm:0.6.3" @@ -6326,6 +6239,13 @@ __metadata: languageName: node linkType: hard +"commander@npm:12.1.0": + version: 12.1.0 + resolution: "commander@npm:12.1.0" + checksum: 68e9818b00fc1ed9cdab9eb16905551c2b768a317ae69a5e3c43924c2b20ac9bb65b27e1cab36aeda7b6496376d4da908996ba2c0b5d79463e0fb1e77935d514 + languageName: node + linkType: hard + "commander@npm:^2.20.0": version: 2.20.3 resolution: "commander@npm:2.20.3" @@ -6340,13 +6260,6 @@ __metadata: languageName: node linkType: hard -"commander@npm:^6.1.0": - version: 6.2.1 - resolution: "commander@npm:6.2.1" - checksum: 
d7090410c0de6bc5c67d3ca41c41760d6d268f3c799e530aafb73b7437d1826bbf0d2a3edac33f8b57cc9887b4a986dce307fa5557e109be40eadb7c43b21742 - languageName: node - linkType: hard - "commander@npm:^7.2.0": version: 7.2.0 resolution: "commander@npm:7.2.0" @@ -6374,13 +6287,6 @@ __metadata: languageName: node linkType: hard -"common-tags@npm:^1.8.0": - version: 1.8.2 - resolution: "common-tags@npm:1.8.2" - checksum: 767a6255a84bbc47df49a60ab583053bb29a7d9687066a18500a516188a062c4e4cd52de341f22de0b07062e699b1b8fe3cfa1cb55b241cb9301aeb4f45b4dff - languageName: node - linkType: hard - "commondir@npm:^1.0.1": version: 1.0.1 resolution: "commondir@npm:1.0.1" @@ -8527,7 +8433,7 @@ __metadata: languageName: node linkType: hard -"fs-extra@npm:^10.0.0, fs-extra@npm:^10.1.0": +"fs-extra@npm:^10.1.0": version: 10.1.0 resolution: "fs-extra@npm:10.1.0" dependencies: @@ -10735,19 +10641,6 @@ __metadata: languageName: node linkType: hard -"json2csv@npm:^5.0.6": - version: 5.0.7 - resolution: "json2csv@npm:5.0.7" - dependencies: - commander: ^6.1.0 - jsonparse: ^1.3.1 - lodash.get: ^4.4.2 - bin: - json2csv: bin/json2csv.js - checksum: 81b511e4f5abba1dcda90593c193d15e5f05f1def91377b6289536e31fdb629889da6a2b4612b9ff699116a29b1758d20c0d71f7921fcfb09863da5b2d883139 - languageName: node - linkType: hard - "json5@npm:^1.0.2": version: 1.0.2 resolution: "json5@npm:1.0.2" @@ -10795,13 +10688,6 @@ __metadata: languageName: node linkType: hard -"jsonparse@npm:^1.3.1": - version: 1.3.1 - resolution: "jsonparse@npm:1.3.1" - checksum: 6514a7be4674ebf407afca0eda3ba284b69b07f9958a8d3113ef1005f7ec610860c312be067e450c569aab8b89635e332cee3696789c750692bb60daba627f4d - languageName: node - linkType: hard - "jsx-ast-utils@npm:^2.4.1 || ^3.0.0": version: 3.3.3 resolution: "jsx-ast-utils@npm:3.3.3" @@ -10858,13 +10744,6 @@ __metadata: languageName: node linkType: hard -"kleur@npm:^4.1.4": - version: 4.1.5 - resolution: "kleur@npm:4.1.5" - checksum: 1dc476e32741acf0b1b5b0627ffd0d722e342c1b0da14de3e8ae97821327ca08f9fb944542fb3c126d90ac5f27f9d804edbe7c585bf7d12ef495d115e0f22c12 - languageName: node - linkType: hard - "klona@npm:^2.0.6": version: 2.0.6 resolution: "klona@npm:2.0.6" @@ -11012,13 +10891,6 @@ __metadata: languageName: node linkType: hard -"lodash.get@npm:^4.4.2": - version: 4.4.2 - resolution: "lodash.get@npm:4.4.2" - checksum: e403047ddb03181c9d0e92df9556570e2b67e0f0a930fcbbbd779370972368f5568e914f913e93f3b08f6d492abc71e14d4e9b7a18916c31fa04bd2306efe545 - languageName: node - linkType: hard - "lodash.kebabcase@npm:4.1.1": version: 4.1.1 resolution: "lodash.kebabcase@npm:4.1.1" @@ -11061,25 +10933,13 @@ __metadata: languageName: node linkType: hard -"lodash@npm:4.17.21, lodash@npm:^4.17.19, lodash@npm:^4.17.20, lodash@npm:^4.17.21, lodash@npm:^4.17.4": +"lodash@npm:4.17.21, lodash@npm:^4.17.19, lodash@npm:^4.17.20, lodash@npm:^4.17.21": version: 4.17.21 resolution: "lodash@npm:4.17.21" checksum: eb835a2e51d381e561e508ce932ea50a8e5a68f4ebdd771ea240d3048244a8d13658acbd502cd4829768c56f2e16bdd4340b9ea141297d472517b83868e677f7 languageName: node linkType: hard -"log-update@npm:^4.0.0": - version: 4.0.0 - resolution: "log-update@npm:4.0.0" - dependencies: - ansi-escapes: ^4.3.0 - cli-cursor: ^3.1.0 - slice-ansi: ^4.0.0 - wrap-ansi: ^6.2.0 - checksum: ae2f85bbabc1906034154fb7d4c4477c79b3e703d22d78adee8b3862fa913942772e7fa11713e3d96fb46de4e3cabefbf5d0a544344f03b58d3c4bff52aa9eb2 - languageName: node - linkType: hard - "long@npm:^5.0.0": version: 5.2.1 resolution: "long@npm:5.2.1" @@ -11883,7 +11743,7 @@ __metadata: languageName: node 
linkType: hard -"onetime@npm:^5.1.0, onetime@npm:^5.1.2": +"onetime@npm:^5.1.2": version: 5.1.2 resolution: "onetime@npm:5.1.2" dependencies: @@ -12291,13 +12151,6 @@ __metadata: languageName: node linkType: hard -"platform@npm:^1.3.3": - version: 1.3.6 - resolution: "platform@npm:1.3.6" - checksum: 6f472a09c61d418c7e26c1c16d0bdc029549d512dbec6526216a1e59ec68100d07007d0097dcba69dddad883d6f2a83361b4bdfe0094a3d9a2af24158643d85e - languageName: node - linkType: hard - "postcss-calc@npm:^8.2.3": version: 8.2.4 resolution: "postcss-calc@npm:8.2.4" @@ -13656,16 +13509,6 @@ __metadata: languageName: node linkType: hard -"restore-cursor@npm:^3.1.0": - version: 3.1.0 - resolution: "restore-cursor@npm:3.1.0" - dependencies: - onetime: ^5.1.0 - signal-exit: ^3.0.2 - checksum: f877dd8741796b909f2a82454ec111afb84eb45890eb49ac947d87991379406b3b83ff9673a46012fca0d7844bb989f45cc5b788254cf1a39b6b5a9659de0630 - languageName: node - linkType: hard - "retry@npm:^0.12.0": version: 0.12.0 resolution: "retry@npm:0.12.0" @@ -14183,17 +14026,6 @@ __metadata: languageName: node linkType: hard -"slice-ansi@npm:^4.0.0": - version: 4.0.0 - resolution: "slice-ansi@npm:4.0.0" - dependencies: - ansi-styles: ^4.0.0 - astral-regex: ^2.0.0 - is-fullwidth-code-point: ^3.0.0 - checksum: 4a82d7f085b0e1b070e004941ada3c40d3818563ac44766cca4ceadd2080427d337554f9f99a13aaeb3b4a94d9964d9466c807b3d7b7541d1ec37ee32d308756 - languageName: node - linkType: hard - "smart-buffer@npm:^4.2.0": version: 4.2.0 resolution: "smart-buffer@npm:4.2.0" @@ -15948,17 +15780,6 @@ __metadata: languageName: node linkType: hard -"wrap-ansi@npm:^6.2.0": - version: 6.2.0 - resolution: "wrap-ansi@npm:6.2.0" - dependencies: - ansi-styles: ^4.0.0 - string-width: ^4.1.0 - strip-ansi: ^6.0.0 - checksum: 6cd96a410161ff617b63581a08376f0cb9162375adeb7956e10c8cd397821f7eb2a6de24eb22a0b28401300bf228c86e50617cd568209b5f6775b93c97d2fe3a - languageName: node - linkType: hard - "wrap-ansi@npm:^7.0.0": version: 7.0.0 resolution: "wrap-ansi@npm:7.0.0" From 95e2b4462f2ea39a6bbaaeaf2af43192caeb12f3 Mon Sep 17 00:00:00 2001 From: Bryan Fox <39674+bryfox@users.noreply.github.com> Date: Tue, 9 Jul 2024 08:51:40 -0400 Subject: [PATCH 18/44] Fix table output with wide columns (#1191) Fix: table output from list commands supports wide columns such as large amounts of metadata Previously, listing the metadata for an mcap file with many (>64kb) key/value pairs failed silently. The Scanner used in the table formatter has a buffer limit of 64kb, and the error result was being ignored. This removes the scanner entirely from most table output. It was introduced to trim leading whitespace from lines, but the formatter accepts other options to make this happen. I've given the info command its own formatter, which I think is appropriate because it's formatting data differently, even including tabs in its row data. The scanner's error result is now checked, though we shouldn't see this in practice for `mcap info`. This also makes a couple of copy fixes to the inline help for the metadata commands. Fixes #1189. 
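For context on the failure mode, a minimal standalone example (not CLI code): `bufio.Scanner` will not return tokens longer than `bufio.MaxScanTokenSize` (64 KiB by default), so `Scan()` simply returns false and the problem only surfaces through `Err()`, which the old code never checked.

```go
package main

import (
	"bufio"
	"errors"
	"fmt"
	"strings"
)

func main() {
	// One "line" longer than the scanner's default 64 KiB token limit.
	long := strings.Repeat("x", bufio.MaxScanTokenSize+1)

	scanner := bufio.NewScanner(strings.NewReader(long))
	lines := 0
	for scanner.Scan() {
		lines++
	}
	fmt.Println("lines scanned:", lines) // 0: the oversized line is never yielded

	// Without this check the truncation is completely silent.
	if err := scanner.Err(); err != nil {
		fmt.Println(errors.Is(err, bufio.ErrTooLong), err) // true bufio.Scanner: token too long
	}
}
```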
--- go/cli/mcap/cmd/info.go | 30 ++++++++++++++++++++++++++++-- go/cli/mcap/cmd/metadata.go | 4 ++-- go/cli/mcap/utils/utils.go | 14 ++++---------- 3 files changed, 34 insertions(+), 14 deletions(-) diff --git a/go/cli/mcap/cmd/info.go b/go/cli/mcap/cmd/info.go index e78e49aa62..5051da7d56 100644 --- a/go/cli/mcap/cmd/info.go +++ b/go/cli/mcap/cmd/info.go @@ -1,6 +1,7 @@ package cmd import ( + "bufio" "bytes" "context" "fmt" @@ -13,6 +14,7 @@ import ( "github.com/foxglove/mcap/go/cli/mcap/utils" "github.com/foxglove/mcap/go/mcap" + "github.com/olekukonko/tablewriter" "github.com/spf13/cobra" ) @@ -89,7 +91,9 @@ func printInfo(w io.Writer, info *mcap.Info) error { header = addRow(header, "end:", "%s", decimalTime(endtime)) } } - utils.FormatTable(buf, header) + if err := printSummaryRows(buf, header); err != nil { + return err + } if len(info.ChunkIndexes) > 0 { compressionFormatStats := make(map[mcap.CompressionFormat]struct { count int @@ -166,7 +170,9 @@ func printInfo(w io.Writer, info *mcap.Info) error { } rows = append(rows, row) } - utils.FormatTable(buf, rows) + if err := printSummaryRows(buf, rows); err != nil { + return err + } if info.Statistics != nil { fmt.Fprintf(buf, "attachments: %d\n", info.Statistics.AttachmentCount) fmt.Fprintf(buf, "metadata: %d\n", info.Statistics.MetadataCount) @@ -178,6 +184,26 @@ func printInfo(w io.Writer, info *mcap.Info) error { return err } +// Similar to utils.FormatTable, but optimized for 'expanded' display of nested data. +func printSummaryRows(w io.Writer, rows [][]string) error { + buf := &bytes.Buffer{} + tw := tablewriter.NewWriter(buf) + tw.SetBorder(false) + tw.SetAutoWrapText(false) + tw.SetAlignment(tablewriter.ALIGN_LEFT) + tw.SetHeaderAlignment(tablewriter.ALIGN_LEFT) + tw.SetColumnSeparator("") + tw.AppendBulk(rows) + tw.Render() + // This tablewriter puts a leading space on the lines for some reason, so + // remove it. + scanner := bufio.NewScanner(buf) + for scanner.Scan() { + fmt.Fprintln(w, strings.TrimLeft(scanner.Text(), " ")) + } + return scanner.Err() +} + var infoCmd = &cobra.Command{ Use: "info", Short: "Report statistics about an MCAP file", diff --git a/go/cli/mcap/cmd/metadata.go b/go/cli/mcap/cmd/metadata.go index f46a547183..b3475b6323 100644 --- a/go/cli/mcap/cmd/metadata.go +++ b/go/cli/mcap/cmd/metadata.go @@ -134,7 +134,7 @@ var addMetadataCmd = &cobra.Command{ var getMetadataCmd = &cobra.Command{ Use: "metadata", - Short: "get metadata by name", + Short: "Get metadata by name", Run: func(_ *cobra.Command, args []string) { ctx := context.Background() if len(args) != 1 { @@ -214,7 +214,7 @@ func init() { } getCmd.AddCommand(getMetadataCmd) - getMetadataCmd.PersistentFlags().StringVarP(&getMetadataName, "name", "n", "", "name of metadata record to create") + getMetadataCmd.PersistentFlags().StringVarP(&getMetadataName, "name", "n", "", "name of metadata record to get") err = getMetadataCmd.MarkPersistentFlagRequired("name") if err != nil { die("failed to mark --name flag as required: %s", err) diff --git a/go/cli/mcap/utils/utils.go b/go/cli/mcap/utils/utils.go index 0af2ad0ca3..be9853223c 100644 --- a/go/cli/mcap/utils/utils.go +++ b/go/cli/mcap/utils/utils.go @@ -1,7 +1,6 @@ package utils import ( - "bufio" "bytes" "context" "encoding/json" @@ -9,7 +8,6 @@ import ( "io" "os" "regexp" - "strings" "time" "cloud.google.com/go/storage" @@ -107,21 +105,17 @@ func WithReader(ctx context.Context, filename string, f func(remote bool, rs io. 
} func FormatTable(w io.Writer, rows [][]string) { - buf := &bytes.Buffer{} - tw := tablewriter.NewWriter(buf) + tw := tablewriter.NewWriter(w) tw.SetBorder(false) tw.SetAutoWrapText(false) tw.SetAlignment(tablewriter.ALIGN_LEFT) tw.SetHeaderAlignment(tablewriter.ALIGN_LEFT) tw.SetColumnSeparator("") + tw.SetTablePadding("\t") + tw.SetNoWhiteSpace(true) + tw.AppendBulk(rows) tw.Render() - // This tablewriter puts a leading space on the lines for some reason, so - // remove it. - scanner := bufio.NewScanner(buf) - for scanner.Scan() { - fmt.Fprintln(w, strings.TrimLeft(scanner.Text(), " ")) - } } func Keys[T any](m map[string]T) []string { From 05505c776bb0e707c8e561e40b0efac501b45ac1 Mon Sep 17 00:00:00 2001 From: Filippo Brizzi Date: Tue, 16 Jul 2024 19:58:48 +0200 Subject: [PATCH 19/44] add page with guide to install mcap with cmake (#1190) ### Changelog Add a page to `website` with the guide on how to install MCAP with Cmake. ### Description This follows conversation from https://foxglove.slack.com/archives/C02H1JXG3C3/p1719462475680899 --------- Co-authored-by: Devon Rueckner --- cpp/README.md | 9 +++++++++ website/docs/guides/cpp/cmake.md | 9 +++++++++ 2 files changed, 18 insertions(+) create mode 100644 website/docs/guides/cpp/cmake.md diff --git a/cpp/README.md b/cpp/README.md index fb09b4dbb3..a986e72cad 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -86,9 +86,18 @@ following dependencies: If your project does not need `lz4` or `zstd` support, you can optionally disable these by defining `MCAP_COMPRESSION_NO_LZ4` or `MCAP_COMPRESSION_NO_ZSTD` respectively. +### Conan + To simplify installation of dependencies, the [Conan](https://conan.io/) package manager can be used with the included [conanfile.py](https://github.com/foxglove/mcap/blob/main/cpp/mcap/conanfile.py). + +### CMake + +For using MCAP with CMake, the third-party [olympus-robotics/mcap_builder](https://github.com/olympus-robotics/mcap_builder) repository provides a helpful wrapper. + +### Alternatives + If you use an alternative approach, such as CMake's FetchContent or directly vendoring the dependencies, make sure you use versions equal or greater than the versions listed above. diff --git a/website/docs/guides/cpp/cmake.md b/website/docs/guides/cpp/cmake.md new file mode 100644 index 0000000000..af42e95d7b --- /dev/null +++ b/website/docs/guides/cpp/cmake.md @@ -0,0 +1,9 @@ +--- +description: Build and use MCAP C++ library with CMake. +--- + +# Build MCAP with CMake + +If you want to add MCAP to your C++ project that uses CMake, the third-party [olympus-robotics/mcap_builder](https://github.com/olympus-robotics/mcap_builder) repository provides a helpful wrapper. + +The readme file in that repository provides the steps and the context needed. From 5add9e25d7f6f8a24fc48f6d9fa4d624393d923d Mon Sep 17 00:00:00 2001 From: kyle-basis Date: Wed, 24 Jul 2024 09:09:33 -0700 Subject: [PATCH 20/44] Remove dead members in McapReader (#1199) ### Changelog Remove unused `startTime_` and `endTime_` members of `McapReader` that were initialized and never used. ### Docs None ### Description This is fairly self evident - if the code compiles it should be the correct change. 
--- cpp/mcap/include/mcap/reader.hpp | 2 -- cpp/mcap/include/mcap/reader.inl | 2 -- 2 files changed, 4 deletions(-) diff --git a/cpp/mcap/include/mcap/reader.hpp b/cpp/mcap/include/mcap/reader.hpp index 907f9450e9..4bdefc65c6 100644 --- a/cpp/mcap/include/mcap/reader.hpp +++ b/cpp/mcap/include/mcap/reader.hpp @@ -493,8 +493,6 @@ class MCAP_PUBLIC McapReader final { std::unordered_map channels_; ByteOffset dataStart_ = 0; ByteOffset dataEnd_ = EndOffset; - Timestamp startTime_ = 0; - Timestamp endTime_ = 0; bool parsedSummary_ = false; void reset_(); diff --git a/cpp/mcap/include/mcap/reader.inl b/cpp/mcap/include/mcap/reader.inl index 8f3ab8b815..ec1343de9a 100644 --- a/cpp/mcap/include/mcap/reader.inl +++ b/cpp/mcap/include/mcap/reader.inl @@ -363,8 +363,6 @@ void McapReader::reset_() { channels_.clear(); dataStart_ = 0; dataEnd_ = EndOffset; - startTime_ = 0; - endTime_ = 0; parsedSummary_ = false; } From a05437584d49c506f77004569d6f4908ff39efec Mon Sep 17 00:00:00 2001 From: Bryan Fox <39674+bryfox@users.noreply.github.com> Date: Fri, 26 Jul 2024 11:10:10 -0400 Subject: [PATCH 21/44] Specify patch version and sync (#1192) This updates the go workspace to specify a version of go, and syncs the workspace. When I used the existing workspace locally, go commands failed, because it could not install a toolchain for version "1.22". These (omitted patch) versions are development versions, which are removed once out of development. --- go/cli/mcap/go.sum | 6 ---- go/conformance/test-read-conformance/go.mod | 16 +++++---- go/conformance/test-read-conformance/go.sum | 22 ++++++------ go/conformance/test-write-conformance/go.mod | 12 ++++--- go/conformance/test-write-conformance/go.sum | 22 ++++++------ go/go.work | 2 +- go/mcap/go.mod | 12 ++++--- go/mcap/go.sum | 23 +++++------- go/ros/go.mod | 11 +++--- go/ros/go.sum | 37 +++++--------------- 10 files changed, 70 insertions(+), 93 deletions(-) diff --git a/go/cli/mcap/go.sum b/go/cli/mcap/go.sum index f3afefde15..fc12da2ecb 100644 --- a/go/cli/mcap/go.sum +++ b/go/cli/mcap/go.sum @@ -100,8 +100,6 @@ github.com/envoyproxy/go-control-plane v0.10.2-0.20220325020618-49ff273808a1/go. 
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/fatih/color v1.13.0 h1:8LOYc1KYPPmyKMuN8QV2DNRWNbLo6LZ0iLs8+mlH53w= github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk= -github.com/foxglove/go-rosbag v0.0.5 h1:UCkYdiBGagpgAql8cNO0d8wX8QZMfGUte0+8aWB2MM4= -github.com/foxglove/go-rosbag v0.0.5/go.mod h1:Kz3doYZfPO6OIawx4tFm9MU9COkuzcYaI963psJeLrA= github.com/foxglove/go-rosbag v0.0.6 h1:LcWr1LqdS1NxWO4+mbPfo7d1jpL3gybqRmX1abD8eAw= github.com/foxglove/go-rosbag v0.0.6/go.mod h1:Kz3doYZfPO6OIawx4tFm9MU9COkuzcYaI963psJeLrA= github.com/foxglove/mcap/go/mcap v0.4.0 h1:jsDZZ6qmMKa174EE8Tw0hxeMUdgjz8emTlN8+6FEnXE= @@ -250,8 +248,6 @@ github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3v github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c= github.com/pelletier/go-toml/v2 v2.0.2 h1:+jQXlF3scKIcSEKkdHzXhCTDLPFi5r1wnK6yPS+49Gw= github.com/pelletier/go-toml/v2 v2.0.2/go.mod h1:MovirKjgVRESsAvNZlAjtFwV867yGuwRkXbG66OzopI= -github.com/pierrec/lz4/v4 v4.1.18 h1:xaKrnTkyoqfh1YItXl56+6KJNVYWlEEPuAQW9xsplYQ= -github.com/pierrec/lz4/v4 v4.1.18/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= @@ -288,8 +284,6 @@ github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5 github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/subosito/gotenv v1.4.0 h1:yAzM1+SmVcz5R4tXGsNMu1jUl2aOJXoiWUCEwwnGrvs= diff --git a/go/conformance/test-read-conformance/go.mod b/go/conformance/test-read-conformance/go.mod index bafb30161b..c26a531003 100644 --- a/go/conformance/test-read-conformance/go.mod +++ b/go/conformance/test-read-conformance/go.mod @@ -2,13 +2,17 @@ module github.com/foxglove/mcap/go/conformance/test-read-conformance go 1.18 -require github.com/foxglove/mcap/go/mcap v0.0.0-20220328132551-ffb9c0b0ebdc +require ( + github.com/foxglove/mcap/go/mcap v0.0.0-20220328132551-ffb9c0b0ebdc + github.com/stretchr/testify v1.9.0 +) require ( - github.com/davecgh/go-spew v1.1.0 // indirect - github.com/klauspost/compress v1.14.1 // indirect - github.com/pierrec/lz4/v4 v4.1.12 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/klauspost/compress v1.16.7 // indirect + github.com/kr/pretty v0.3.0 // indirect + github.com/pierrec/lz4/v4 v4.1.21 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/stretchr/testify v1.7.0 // indirect - gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect + gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go/conformance/test-read-conformance/go.sum 
b/go/conformance/test-read-conformance/go.sum index 9620dd0980..5224437c70 100644 --- a/go/conformance/test-read-conformance/go.sum +++ b/go/conformance/test-read-conformance/go.sum @@ -1,15 +1,13 @@ -github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/klauspost/compress v1.14.1 h1:hLQYb23E8/fO+1u53d02A97a8UnsddcvYzq4ERRU4ds= -github.com/klauspost/compress v1.14.1/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= -github.com/pierrec/lz4/v4 v4.1.12 h1:44l88ehTZAUGW4VlO1QC4zkilL99M6Y9MXNwEs0uzP8= -github.com/pierrec/lz4/v4 v4.1.12/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/foxglove/mcap/go/mcap v0.0.0-20220328132551-ffb9c0b0ebdc h1:v4dm5b/Z412h6gPY8kwixeVtFRiixK4KIY7yV90p1T4= +github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= +github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/go/conformance/test-write-conformance/go.mod b/go/conformance/test-write-conformance/go.mod index 12d48f373e..81200a7908 100644 --- a/go/conformance/test-write-conformance/go.mod +++ b/go/conformance/test-write-conformance/go.mod @@ -4,13 +4,15 @@ go 1.18 require ( github.com/foxglove/mcap/go/mcap v0.0.0-20220328132551-ffb9c0b0ebdc - github.com/stretchr/testify v1.7.0 + github.com/stretchr/testify v1.9.0 ) require ( - github.com/davecgh/go-spew v1.1.0 // indirect - github.com/klauspost/compress v1.14.1 // indirect - github.com/pierrec/lz4/v4 v4.1.12 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/klauspost/compress v1.16.7 // indirect + github.com/kr/pretty v0.3.0 // indirect + github.com/pierrec/lz4/v4 v4.1.21 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect + gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go/conformance/test-write-conformance/go.sum b/go/conformance/test-write-conformance/go.sum 
index 9620dd0980..5224437c70 100644 --- a/go/conformance/test-write-conformance/go.sum +++ b/go/conformance/test-write-conformance/go.sum @@ -1,15 +1,13 @@ -github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/klauspost/compress v1.14.1 h1:hLQYb23E8/fO+1u53d02A97a8UnsddcvYzq4ERRU4ds= -github.com/klauspost/compress v1.14.1/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= -github.com/pierrec/lz4/v4 v4.1.12 h1:44l88ehTZAUGW4VlO1QC4zkilL99M6Y9MXNwEs0uzP8= -github.com/pierrec/lz4/v4 v4.1.12/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/foxglove/mcap/go/mcap v0.0.0-20220328132551-ffb9c0b0ebdc h1:v4dm5b/Z412h6gPY8kwixeVtFRiixK4KIY7yV90p1T4= +github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= +github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/go/go.work b/go/go.work index d4baeee12a..7326675cdf 100644 --- a/go/go.work +++ b/go/go.work @@ -1,4 +1,4 @@ -go 1.22 +go 1.22.5 use ( ./cli/mcap diff --git a/go/mcap/go.mod b/go/mcap/go.mod index 0d508af31e..d4ad84fc17 100644 --- a/go/mcap/go.mod +++ b/go/mcap/go.mod @@ -3,13 +3,15 @@ module github.com/foxglove/mcap/go/mcap go 1.18 require ( - github.com/klauspost/compress v1.15.12 - github.com/pierrec/lz4/v4 v4.1.12 - github.com/stretchr/testify v1.7.0 + github.com/klauspost/compress v1.16.7 + github.com/pierrec/lz4/v4 v4.1.21 + github.com/stretchr/testify v1.9.0 ) require ( - github.com/davecgh/go-spew v1.1.0 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/kr/pretty v0.3.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect + gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go/mcap/go.sum b/go/mcap/go.sum index 6097f86018..9b52a5a7c1 100644 --- a/go/mcap/go.sum +++ b/go/mcap/go.sum @@ -1,17 +1,12 @@ -github.com/davecgh/go-spew v1.1.0 
h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/klauspost/compress v1.14.1 h1:hLQYb23E8/fO+1u53d02A97a8UnsddcvYzq4ERRU4ds= -github.com/klauspost/compress v1.14.1/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= -github.com/klauspost/compress v1.15.12 h1:YClS/PImqYbn+UILDnqxQCZ3RehC9N318SU3kElDUEM= -github.com/klauspost/compress v1.15.12/go.mod h1:QPwzmACJjUTFsnSHH934V6woptycfrDDJnH7hvFVbGM= -github.com/pierrec/lz4/v4 v4.1.12 h1:44l88ehTZAUGW4VlO1QC4zkilL99M6Y9MXNwEs0uzP8= -github.com/pierrec/lz4/v4 v4.1.12/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= +github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/go/ros/go.mod b/go/ros/go.mod index 342b997784..d259c05b10 100644 --- a/go/ros/go.mod +++ b/go/ros/go.mod @@ -3,15 +3,18 @@ module github.com/foxglove/mcap/go/ros go 1.18 require ( + github.com/foxglove/go-rosbag v0.0.6 github.com/foxglove/mcap/go/mcap v0.4.0 - github.com/mattn/go-sqlite3 v1.14.11 - github.com/pierrec/lz4/v4 v4.1.17 - github.com/stretchr/testify v1.8.1 + github.com/mattn/go-sqlite3 v1.14.14 + github.com/pierrec/lz4/v4 v4.1.21 + github.com/stretchr/testify v1.9.0 ) require ( github.com/davecgh/go-spew v1.1.1 // indirect - github.com/klauspost/compress v1.15.15 // indirect + github.com/klauspost/compress v1.16.7 // indirect + github.com/kr/pretty v0.3.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go/ros/go.sum b/go/ros/go.sum index 8e2341bcef..ed6e789d23 100644 --- a/go/ros/go.sum +++ b/go/ros/go.sum @@ -1,37 +1,18 @@ -github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/foxglove/mcap/go/mcap v0.0.0-20220316142927-cc81709134cd h1:IG8HSe6kkuB4TyoRhp1XYTrjYvy0iUGJXAzfhkJ5By8= -github.com/foxglove/mcap/go/mcap v0.0.0-20220316142927-cc81709134cd/go.mod h1:gQrB8PzccHW69xedSZ0uVDQVgDd3h1qX+otbS6fjSkE= +github.com/foxglove/go-rosbag v0.0.6 h1:LcWr1LqdS1NxWO4+mbPfo7d1jpL3gybqRmX1abD8eAw= github.com/foxglove/mcap/go/mcap v0.4.0 h1:jsDZZ6qmMKa174EE8Tw0hxeMUdgjz8emTlN8+6FEnXE= github.com/foxglove/mcap/go/mcap v0.4.0/go.mod h1:3UsmtxZGHWURgxEgQh3t0cGfyPyLoCGsa/gtS/Y6UPM= -github.com/klauspost/compress v1.14.1 h1:hLQYb23E8/fO+1u53d02A97a8UnsddcvYzq4ERRU4ds= -github.com/klauspost/compress v1.14.1/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= -github.com/klauspost/compress v1.15.12 h1:YClS/PImqYbn+UILDnqxQCZ3RehC9N318SU3kElDUEM= -github.com/klauspost/compress v1.15.12/go.mod h1:QPwzmACJjUTFsnSHH934V6woptycfrDDJnH7hvFVbGM= -github.com/klauspost/compress v1.15.15 h1:EF27CXIuDsYJ6mmvtBRlEuB2UVOqHG1tAXgZ7yIO+lw= -github.com/klauspost/compress v1.15.15/go.mod h1:ZcK2JAFqKOpnBlxcLsJzYfrS9X1akm9fHZNnD9+Vo/4= -github.com/mattn/go-sqlite3 v1.14.11 h1:gt+cp9c0XGqe9S/wAHTL3n/7MqY+siPWgWJgqdsFrzQ= -github.com/mattn/go-sqlite3 v1.14.11/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU= -github.com/pierrec/lz4/v4 v4.1.12 h1:44l88ehTZAUGW4VlO1QC4zkilL99M6Y9MXNwEs0uzP8= -github.com/pierrec/lz4/v4 v4.1.12/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= -github.com/pierrec/lz4/v4 v4.1.17 h1:kV4Ip+/hUBC+8T6+2EgburRtkE9ef4nbY3f4dFhGjMc= -github.com/pierrec/lz4/v4 v4.1.17/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= +github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= +github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/mattn/go-sqlite3 v1.14.14 h1:qZgc/Rwetq+MtyE18WhzjokPD93dNqLGNT3QJuLvBGw= +github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= -github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= -github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= -gopkg.in/yaml.v3 
v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= From ebea94600c393864dd8fa41e840e90629960bc6d Mon Sep 17 00:00:00 2001 From: Bryan Fox <39674+bryfox@users.noreply.github.com> Date: Fri, 26 Jul 2024 11:41:13 -0400 Subject: [PATCH 22/44] Update go linter (#1204) This upgrades the version of `golangci-lint` to the latest version, and removes three deprecated linters, which now error when included because they are completely unused: https://github.com/foxglove/mcap/pull/1192#issuecomment-2252886768. --- .github/workflows/ci.yml | 2 +- go/.golangci.yaml | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8e9eff44b6..e7e2083df1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -387,7 +387,7 @@ jobs: with: go-version-file: go/go.work - name: install golangci-lint - run: go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.56.2 + run: go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.59.1 - run: make lint - run: make test - name: Check library version diff --git a/go/.golangci.yaml b/go/.golangci.yaml index 4371be5156..c5c1d92dc3 100644 --- a/go/.golangci.yaml +++ b/go/.golangci.yaml @@ -4,7 +4,6 @@ run: linters: disable-all: true enable: - - deadcode - errcheck - gosimple - govet @@ -12,11 +11,9 @@ linters: - staticcheck - typecheck - unused - - varcheck - gocritic - godot - gofmt - - ifshort - misspell - prealloc - sqlclosecheck From 4a9145db0bc9eaa5bd266ebc4ca4bac5a30bf56d Mon Sep 17 00:00:00 2001 From: Eric Miller Date: Sun, 28 Jul 2024 21:37:14 -0700 Subject: [PATCH 23/44] rust: Support writing without chunks (#1201) On very memory-constrained systems like microcontrollers, the overhead of storing all the chunk offset data in memory until the end of the file can be significant. To avoid that, we instead write all Channel, Schema, and Message data directly as records rather than using chunks, and rely on the reader (or an intermediate ingestion process) to generate the index information. ### Changelog The rust library now (optionally) supports writing unchunked MCAP files for decreased memory footprint. ### Docs None ### Description In our internal testing, storing chunk data required >40KB of memory for several-minute logs, which is a lot for an embedded system with only 300KB of RAM total. Notably, this grows without bound during the test, since the offsets and data about all chunks needs to be stored until the end of the log when it will be written to the index. For our usecase, it's acceptable to disable chunking entirely (along with compression and indexing), since we can add those things later during the log upload process. We're using this internally on an ESP32-S3 and it is working well.
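To make the new option concrete, here is a minimal caller-side sketch (not part of the patch) of writing an unchunked file through the `use_chunks` option introduced in this change; the topic name and payload are invented, and error handling is reduced to `expect`:

```
use std::{borrow::Cow, io::Cursor, sync::Arc};

use mcap::{Channel, Message, WriteOptions};

fn main() {
    let mut buffer = Vec::new();
    {
        // Chunking disabled: records go straight to the data section, so the
        // writer never buffers compressed chunk data or chunk-index metadata.
        let mut writer = WriteOptions::default()
            .use_chunks(false)
            .create(Cursor::new(&mut buffer))
            .expect("create writer");

        let channel = Arc::new(Channel {
            topic: "log".into(),
            message_encoding: "raw".into(),
            metadata: Default::default(),
            schema: None,
        });

        let payload: &[u8] = b"hello";
        writer
            .write(&Message {
                channel,
                sequence: 0,
                log_time: 0,
                publish_time: 0,
                data: Cow::Borrowed(payload),
            })
            .expect("write message");

        writer.finish().expect("finish");
    }
    println!("wrote {} bytes without chunks", buffer.len());
}
```

As the doc comment in the diff notes, a file written this way can later be post-processed (for example with `mcap recover`) to add the indexes back for efficient reading.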
--------- Co-authored-by: Eric Miller --- .vscode/settings.json | 3 ++ rust/src/write.rs | 93 +++++++++++++++++++++++++++++++++++-------- rust/tests/message.rs | 15 ++++++- 3 files changed, 93 insertions(+), 18 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index d65b6a426c..05b6fab2c7 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -47,5 +47,8 @@ "C_Cpp.default.cppStandard": "c++17", "[go]": { "editor.defaultFormatter": "golang.go" + }, + "[rust]": { + "editor.defaultFormatter": "rust-lang.rust-analyzer" } } diff --git a/rust/src/write.rs b/rust/src/write.rs index 4d3c44d33b..32e47a0524 100644 --- a/rust/src/write.rs +++ b/rust/src/write.rs @@ -108,6 +108,7 @@ pub struct WriteOptions { compression: Option, profile: String, chunk_size: Option, + use_chunks: bool, } impl Default for WriteOptions { @@ -119,6 +120,7 @@ impl Default for WriteOptions { compression: None, profile: String::new(), chunk_size: Some(1024 * 768), + use_chunks: true, } } } @@ -157,6 +159,18 @@ impl WriteOptions { } } + /// specifies whether to use chunks for storing messages. + /// + /// If `false`, messages will be written directly to the data section of the file. + /// This prevents using compression or indexing, but may be useful on small embedded systems + /// that cannot afford the memory overhead of storing chunk metadata for the entire recording. + /// + /// Note that it's often useful to post-process a non-chunked file using `mcap recover` to add + /// indexes for efficient processing. + pub fn use_chunks(self, use_chunks: bool) -> Self { + Self { use_chunks, ..self } + } + /// Creates a [`Writer`] whch writes to `w` using the given options pub fn create<'a, W: Write + Seek>(self, w: W) -> McapResult> { Writer::with_options(w, self) @@ -227,8 +241,21 @@ impl<'a, W: Write + Seek> Writer<'a, W> { .channels .insert(chan.clone(), next_channel_id) .is_none()); - self.chunkin_time()? - .write_channel(next_channel_id, schema_id, chan)?; + if self.options.use_chunks { + self.chunkin_time()? + .write_channel(next_channel_id, schema_id, chan)?; + } else { + write_record( + self.finish_chunk()?, + &Record::Channel(records::Channel { + id: next_channel_id, + schema_id, + topic: chan.topic.clone(), + message_encoding: chan.message_encoding.clone(), + metadata: chan.metadata.clone(), + }), + )?; + } Ok(next_channel_id) } @@ -244,7 +271,21 @@ impl<'a, W: Write + Seek> Writer<'a, W> { .schemas .insert(schema.clone(), next_schema_id) .is_none()); - self.chunkin_time()?.write_schema(next_schema_id, schema)?; + if self.options.use_chunks { + self.chunkin_time()?.write_schema(next_schema_id, schema)?; + } else { + write_record( + self.finish_chunk()?, + &Record::Schema { + header: records::SchemaHeader { + id: next_schema_id, + name: schema.name.clone(), + encoding: schema.encoding.clone(), + }, + data: Cow::Borrowed(&schema.data), + }, + )?; + } Ok(next_schema_id) } @@ -301,7 +342,17 @@ impl<'a, W: Write + Seek> Writer<'a, W> { } } - self.chunkin_time()?.write_message(header, data)?; + if self.options.use_chunks { + self.chunkin_time()?.write_message(header, data)?; + } else { + write_record( + self.finish_chunk()?, + &Record::Message { + header: *header, + data: Cow::Borrowed(data), + }, + )?; + } Ok(()) } @@ -389,6 +440,11 @@ impl<'a, W: Write + Seek> Writer<'a, W> { // (That would leave it in an unspecified state if we bailed here!) // Instead briefly swap it out for a null writer while we set up the chunker // The writer will only be None if finish() was called. 
+ assert!( + self.options.use_chunks, + "Trying to write to a chunk when chunking is disabled" + ); + let prev_writer = self.writer.take().expect(Self::WHERE_WRITER); self.writer = Some(match prev_writer { @@ -566,18 +622,23 @@ impl<'a, W: Write + Seek> Writer<'a, W> { }); } - // Write all chunk indexes. - let chunk_indexes_start = channels_end; - for index in chunk_indexes { - write_record(&mut ccw, &Record::ChunkIndex(index))?; - } - let chunk_indexes_end = posit(&mut ccw)?; - if chunk_indexes_end - chunk_indexes_start > 0 { - offsets.push(records::SummaryOffset { - group_opcode: op::CHUNK_INDEX, - group_start: chunk_indexes_start, - group_length: chunk_indexes_end - chunk_indexes_start, - }); + let chunk_indexes_end; + if self.options.use_chunks { + // Write all chunk indexes. + let chunk_indexes_start = channels_end; + for index in chunk_indexes { + write_record(&mut ccw, &Record::ChunkIndex(index))?; + } + chunk_indexes_end = posit(&mut ccw)?; + if chunk_indexes_end - chunk_indexes_start > 0 { + offsets.push(records::SummaryOffset { + group_opcode: op::CHUNK_INDEX, + group_start: chunk_indexes_start, + group_length: chunk_indexes_end - chunk_indexes_start, + }); + } + } else { + chunk_indexes_end = channels_end; } // ...and attachment indexes diff --git a/rust/tests/message.rs b/rust/tests/message.rs index 1b63f094ed..eb92a2b61c 100644 --- a/rust/tests/message.rs +++ b/rust/tests/message.rs @@ -39,11 +39,22 @@ fn smoke() -> Result<()> { #[test] fn round_trip() -> Result<()> { + run_round_trip(true) +} + +#[test] +fn round_trip_no_chunks() -> Result<()> { + run_round_trip(false) +} + +fn run_round_trip(use_chunks: bool) -> Result<()> { let mapped = map_mcap("../tests/conformance/data/OneMessage/OneMessage.mcap")?; let messages = mcap::MessageStream::new(&mapped)?; let mut tmp = tempfile()?; - let mut writer = mcap::Writer::new(BufWriter::new(&mut tmp))?; + let mut writer = mcap::WriteOptions::default() + .use_chunks(use_chunks) + .create(BufWriter::new(&mut tmp))?; for m in messages { writer.write(&m?)?; @@ -71,7 +82,7 @@ fn round_trip() -> Result<()> { message_count: 1, schema_count: 1, channel_count: 1, - chunk_count: 1, + chunk_count: if use_chunks { 1 } else { 0 }, message_start_time: 2, message_end_time: 2, channel_message_counts: [(0, 1)].into(), From 71c5aa33d2c8024e56ea0746ab1598f55400cae0 Mon Sep 17 00:00:00 2001 From: james-rms Date: Thu, 1 Aug 2024 07:49:33 +1000 Subject: [PATCH 24/44] go: return a bad magic error in check when parsing summary (#1206) ### Changelog - Changed: when attempting to read a corrupt MCAP using using the index, the reader will return an ErrBadMagic instance rather than a bare Error. ### Docs None. ### Description it's useful to be able to tell this error apart from errors that might result from i/o, but right now there is no simple way to determine this using `errors.Is`. This PR gives the error a publicly known type, so that callers can check for it.
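For illustration (not from the patch), a caller-side sketch of distinguishing this error from ordinary I/O failures; the package and function names here are hypothetical, and only the exported `*mcap.ErrBadMagic` type comes from the change below:

```
package mcapcheck // hypothetical helper package, not part of the patch

import (
	"errors"
	"fmt"

	"github.com/foxglove/mcap/go/mcap"
)

// ClassifyReadError wraps a read error with a clearer message when the cause
// is corrupt MCAP magic bytes (reported as *mcap.ErrBadMagic) rather than,
// say, an I/O failure.
func ClassifyReadError(err error) error {
	var badMagic *mcap.ErrBadMagic
	if errors.As(err, &badMagic) {
		return fmt.Errorf("not a valid MCAP file (bad magic): %w", err)
	}
	return err
}
```

`errors.As` needs only the exported concrete type; whether `errors.Is` also matches depends on `ErrBadMagic` defining an `Is` method, which this diff does not show.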
--- go/mcap/indexed_message_iterator.go | 2 +- go/mcap/version.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/go/mcap/indexed_message_iterator.go b/go/mcap/indexed_message_iterator.go index 5aae96f08d..ba06c4e521 100644 --- a/go/mcap/indexed_message_iterator.go +++ b/go/mcap/indexed_message_iterator.go @@ -83,7 +83,7 @@ func (it *indexedMessageIterator) parseSummarySection() error { } magic := buf[20:] if !bytes.Equal(magic, Magic) { - return fmt.Errorf("not an MCAP file") + return &ErrBadMagic{location: magicLocationEnd, actual: magic} } footer, err := ParseFooter(buf[:20]) if err != nil { diff --git a/go/mcap/version.go b/go/mcap/version.go index 16fa2d69e8..0f69efec2c 100644 --- a/go/mcap/version.go +++ b/go/mcap/version.go @@ -1,4 +1,4 @@ package mcap // Version of the MCAP library. -var Version = "v1.4.1" +var Version = "v1.5.0" From 00eddd341fc71809711c7aec702e338e056607d8 Mon Sep 17 00:00:00 2001 From: Adrian Macneil Date: Thu, 1 Aug 2024 14:43:02 -0700 Subject: [PATCH 25/44] Remove CLA comment from readme (#1207) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have decided to remove the Contributor License Agreement (CLA) from MCAP and other Foxglove open source projects. The project remains MIT licensed, but we no longer require an additional copyright grant from contributors 🎉. Per the [GitHub Terms of Service](https://docs.github.com/en/site-policy/github-terms/github-terms-of-service#6-contributions-under-repository-license), submitted pull requests are licensed under the same terms (MIT). image --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b8b229a9f3..f59379e2e8 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Download the latest `mcap-cli` version from the [releases page](https://github.c ## License -[MIT License](/LICENSE). Contributors are required to accept the [Contributor License Agreement](https://github.com/foxglove/cla). +[MIT License](/LICENSE). 
## Release process From c95275f41fc931f3f444c3d3b9edccb4d677c17d Mon Sep 17 00:00:00 2001 From: Aaron O'Mullan Date: Mon, 12 Aug 2024 05:25:10 +0300 Subject: [PATCH 26/44] perf(ts): denoise baseline mem usage (#1209) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - TS_NODE_TRANSPILE_ONLY=true - Use v8 heap stats instead of memory usage (isolate vs process heap) - Baseline before each bench iter and tracking delta ``` # Current main McapStreamReader 3.01±0.08 op/s Heap Used: 268.92±3.81 MB/op Heap Total: 297.93±4.13 MB/op ArrayBuffers: 213.39±2.40 MB/op McapIndexedReader 2.13±0.01 op/s Heap Used: 284.31±3.36 MB/op Heap Total: 316.23±1.45 MB/op ArrayBuffers: 107.06±0.00 MB/op McapIndexedReader_reverse 2.14±0.02 op/s Heap Used: 280.16±3.09 MB/op Heap Total: 315.45±0.79 MB/op ArrayBuffers: 107.06±0.00 MB/op # New heap baseline (ts-node transpile only + delta tracking) McapStreamReader 3.44±0.03 op/s Heap Used: 51.80±12.44 MB/op Heap Total: 42.89±11.53 MB/op ArrayBuffers: 112.95±6.87 MB/op McapIndexedReader 2.16±0.01 op/s Heap Used: 70.75±2.55 MB/op Heap Total: 60.15±2.89 MB/op ArrayBuffers: 17.86±0.76 MB/op McapIndexedReader_reverse 2.17±0.03 op/s Heap Used: 59.98±2.50 MB/op Heap Total: 40.93±1.26 MB/op ArrayBuffers: 16.00±0.00 MB/op # New baseline on first change (bigint pr) McapStreamReader 4.20±0.07 op/s Heap Used: 35.31±2.60 MB/op Heap Total: 28.69±2.44 MB/op ArrayBuffers: 118.47±10.33 MB/op McapIndexedReader 2.58±0.01 op/s Heap Used: 23.74±2.33 MB/op Heap Total: 12.38±1.05 MB/op ArrayBuffers: 1.42±3.41 MB/op McapIndexedReader_reverse 2.60±0.00 op/s Heap Used: 24.55±0.72 MB/op Heap Total: 14.05±0.32 MB/op ArrayBuffers: 5.60±0.98 MB/op ``` --- typescript/benchmarks/bench.ts | 23 ++++++++++++++++------- typescript/benchmarks/package.json | 4 ++-- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/typescript/benchmarks/bench.ts b/typescript/benchmarks/bench.ts index d0baead94f..a37926fe47 100644 --- a/typescript/benchmarks/bench.ts +++ b/typescript/benchmarks/bench.ts @@ -1,4 +1,5 @@ import { hrtime, memoryUsage } from "process"; +import { getHeapStatistics } from "v8"; const COUNT = 5; @@ -9,10 +10,8 @@ type BenchmarkResult = samples: { duration: bigint; memoryUsage: { - rss: number; - heapTotal: number; - heapUsed: number; - external: number; + usedHeapSize: number; + totalHeapSize: number; arrayBuffers: number; }; }[]; @@ -40,12 +39,22 @@ export async function runBenchmark(name: string, run: () => Promise): Prom }; for (let i = 0; i < COUNT; i++) { global.gc(); + const baseline = getHeapStatistics(); + const baselineArrayBuffers = memoryUsage().arrayBuffers; const before = hrtime.bigint(); + await run(); + const after = hrtime.bigint(); + const currentMemoryUsage = getHeapStatistics(); + const currentArrayBuffers = process.memoryUsage().arrayBuffers; result.samples.push({ duration: after - before, - memoryUsage: memoryUsage(), + memoryUsage: { + usedHeapSize: currentMemoryUsage.used_heap_size - baseline.used_heap_size, + totalHeapSize: currentMemoryUsage.total_heap_size - baseline.total_heap_size, + arrayBuffers: currentArrayBuffers - baselineArrayBuffers, + }, }); } } else { @@ -84,11 +93,11 @@ function printStats(result: BenchmarkResult) { let memoryResult = "(use --expose-gc to gather memory statistics)"; if (result.gcExposed) { const used = humanReadableStatistics( - result.samples.map((sample) => sample.memoryUsage.heapUsed / 2 ** 20), + result.samples.map((sample) => sample.memoryUsage.usedHeapSize / 2 ** 20), "MB/op", ); const 
total = humanReadableStatistics( - result.samples.map((sample) => sample.memoryUsage.heapTotal / 2 ** 20), + result.samples.map((sample) => sample.memoryUsage.totalHeapSize / 2 ** 20), "MB/op", ); const arrayBuffers = humanReadableStatistics( diff --git a/typescript/benchmarks/package.json b/typescript/benchmarks/package.json index 9121b5f574..686bfd7c12 100644 --- a/typescript/benchmarks/package.json +++ b/typescript/benchmarks/package.json @@ -17,8 +17,8 @@ "typecheck": "tsc -p tsconfig.json --noEmit", "lint:ci": "eslint --report-unused-disable-directives .", "lint": "eslint --report-unused-disable-directives --fix .", - "bench": "TS_NODE_FILES=true TS_NODE_PROJECT=tsconfig.cjs.json node --expose-gc -r 'ts-node/register' index.ts", - "bench:debug": "TS_NODE_FILES=true TS_NODE_PROJECT=tsconfig.cjs.json node --inspect-brk --expose-gc -r 'ts-node/register' index.ts" + "bench": "TS_NODE_TRANSPILE_ONLY=true TS_NODE_FILES=true TS_NODE_PROJECT=tsconfig.cjs.json node --huge-max-old-generation-size --expose-gc -r 'ts-node/register' index.ts", + "bench:debug": "TS_NODE_FILES=true TS_NODE_PROJECT=tsconfig.cjs.json node --huge-max-old-generation-size --inspect-brk --expose-gc -r 'ts-node/register' index.ts" }, "devDependencies": { "@foxglove/eslint-plugin": "1.0.1", From 7f82b06fc9366b30670a3749e35f9b638a0a4df7 Mon Sep 17 00:00:00 2001 From: Aaron O'Mullan Date: Wed, 14 Aug 2024 04:11:44 +0300 Subject: [PATCH 27/44] perf(rust): add MessageStream benches (#1213) Benches MessageStream on 1M messages uncompressed/zstd/lz4. ### Baseline (m1 pro) ``` mcap_read/MessageStream_1M_uncompressed time: [67.954 ms 68.277 ms 68.946 ms] thrpt: [14.504 Melem/s 14.646 Melem/s 14.716 Melem/s] mcap_read/MessageStream_1M_lz4 time: [337.77 ms 344.16 ms 350.76 ms] thrpt: [2.8510 Melem/s 2.9056 Melem/s 2.9606 Melem/s] mcap_read/MessageStream_1M_zstd time: [381.62 ms 384.54 ms 388.35 ms] thrpt: [2.5750 Melem/s 2.6005 Melem/s 2.6204 Melem/s] ``` --- rust/Cargo.toml | 10 +++++ rust/benches/reader.rs | 91 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 rust/benches/reader.rs diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 6ce4a85fef..6c9f0fb8c3 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -40,6 +40,7 @@ anyhow = "1" atty = "0.2" camino = "1.0" clap = { version = "3.2", features = ["derive"]} +criterion = "0.5.1" itertools = "0.10" memmap = "0.7" rayon = "1.5" @@ -47,3 +48,12 @@ serde = { version = "1.0.145", features = ["derive"] } serde_json = "1" simplelog = "0.12" tempfile = "3.3" + +[[bench]] +name = "reader" +harness = false + +[profile.bench] +opt-level = 3 +debug = true +lto = true diff --git a/rust/benches/reader.rs b/rust/benches/reader.rs new file mode 100644 index 0000000000..d234ccf6d3 --- /dev/null +++ b/rust/benches/reader.rs @@ -0,0 +1,91 @@ +use criterion::{criterion_group, criterion_main, Criterion}; +use mcap::{Channel, Message, MessageStream, Schema}; +use std::borrow::Cow; +use std::io::Cursor; +use std::sync::Arc; +use std::time::Duration; + +fn create_test_mcap(n: usize, compression: Option) -> Vec { + let mut buffer = Vec::new(); + { + let mut writer = mcap::WriteOptions::new() + .compression(compression) + .profile("fooey") + .create(Cursor::new(&mut buffer)) + .unwrap(); + // Mock message data to align with reader benchmarks in ts + const MESSAGE_DATA: &[u8] = &[42; 10]; + + let schema = Arc::new(Schema { + name: "TestSchema".to_string(), + encoding: "raw".to_string(), + data: Cow::Borrowed(b"{}"), + }); + + let channel = Arc::new(Channel 
{ + topic: "test_topic".to_string(), + message_encoding: "raw".to_string(), + metadata: Default::default(), + schema: Some(schema), + }); + + for i in 0..n { + let message = Message { + channel: channel.clone(), + sequence: i as u32, + log_time: i as u64, + publish_time: i as u64, + data: Cow::Borrowed(&MESSAGE_DATA), + }; + writer.write(&message).unwrap(); + } + + writer.finish().unwrap(); + } + buffer +} + +fn bench_read_messages(c: &mut Criterion) { + const N: usize = 1_000_000; + let mcap_data_uncompressed = create_test_mcap(N, None); + let mcap_data_lz4 = create_test_mcap(N, Some(mcap::Compression::Lz4)); + let mcap_data_zstd = create_test_mcap(N, Some(mcap::Compression::Zstd)); + let mut group = c.benchmark_group("mcap_read"); + group.throughput(criterion::Throughput::Elements(N as u64)); + + group.bench_function("MessageStream_1M_uncompressed", |b| { + b.iter(|| { + let stream = MessageStream::new(&mcap_data_uncompressed).unwrap(); + for message in stream { + std::hint::black_box(message.unwrap()); + } + }); + }); + + group.bench_function("MessageStream_1M_lz4", |b| { + b.iter(|| { + let stream = MessageStream::new(&mcap_data_lz4).unwrap(); + for message in stream { + std::hint::black_box(message.unwrap()); + } + }); + }); + + group.bench_function("MessageStream_1M_zstd", |b| { + b.iter(|| { + let stream = MessageStream::new(&mcap_data_zstd).unwrap(); + for message in stream { + std::hint::black_box(message.unwrap()); + } + }); + }); + + group.finish(); +} + +criterion_group! { + name = benches; + config = Criterion::default().warm_up_time(Duration::from_secs(1)).sample_size(10); + targets = bench_read_messages +} +criterion_main!(benches); From 7fb9dfa1d3fad5c075a0232f32c364a35f3e2e71 Mon Sep 17 00:00:00 2001 From: Aaron O'Mullan Date: Wed, 14 Aug 2024 04:55:02 +0300 Subject: [PATCH 28/44] perf(rust): prealloc decompression buffers (#1214) To avoid reallocs as decompressors write into them, boosts decompressed throughput by +40-50%. 
Also bump rust 0.9.1 => 0.9.2 --- rust/Cargo.toml | 2 +- rust/src/read.rs | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 6c9f0fb8c3..f2fa8ca0d2 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -7,7 +7,7 @@ categories = [ "science::robotics", "compression" ] repository = "https://github.com/foxglove/mcap" documentation = "https://docs.rs/mcap" readme = "README.md" -version = "0.9.1" +version = "0.9.2" edition = "2021" license = "MIT" diff --git a/rust/src/read.rs b/rust/src/read.rs index cd6808c089..72dad85fb2 100644 --- a/rust/src/read.rs +++ b/rust/src/read.rs @@ -368,7 +368,7 @@ fn read_record_from_chunk_stream<'a, R: Read>(r: &mut R) -> McapResult { - let mut record = Vec::new(); + let mut record = Vec::with_capacity(len as usize); r.take(len).read_to_end(&mut record)?; if len as usize != record.len() { return Err(McapError::UnexpectedEoc); @@ -396,7 +396,7 @@ fn read_record_from_chunk_stream<'a, R: Read>(r: &mut R) -> McapResult { - let mut record = Vec::new(); + let mut record = Vec::with_capacity(len as usize); r.take(len).read_to_end(&mut record)?; if len as usize != record.len() { return Err(McapError::UnexpectedEoc); @@ -421,14 +421,14 @@ fn read_record_from_chunk_stream<'a, R: Read>(r: &mut R) -> McapResult Date: Thu, 15 Aug 2024 03:49:50 +0300 Subject: [PATCH 29/44] perf(ts): reuse Reader whilst parsing records (#1212) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduces `McapStreamReader` heap usage by ~25% and boosts throughput by ~30%. This is both a refactor and perf boost, with more room for improvement, key changes: - Removes StreamBuffer, hoisted into McapStreamReader - Reuses Reader and DataView class across parse calls, only resetting them when necessary (e.g: append in McapStreamReader) - Splits `parseRecord` into small scoped parsing functions, this itself is perf neutral (slightly positive) but facilitates future monomorphic fast paths - Moves offsets tracking into Reader which is cleaner and faster ### Before ``` McapStreamReader 3.48±0.03 op/s Heap Used: 49.56±12.75 MB/op Heap Total: 41.47±11.83 MB/op ArrayBuffers: 112.95±6.87 MB/op McapIndexedReader 2.15±0.02 op/s Heap Used: 70.02±2.84 MB/op Heap Total: 58.34±3.36 MB/op ArrayBuffers: 17.86±0.76 MB/op McapIndexedReader_reverse 2.18±0.01 op/s Heap Used: 59.92±2.86 MB/op Heap Total: 39.81±1.00 MB/op ArrayBuffers: 14.58±1.42 MB/op ``` ### After ``` McapStreamReader 4.47±0.08 op/s Heap Used: 42.35±2.23 MB/op Heap Total: 32.93±3.76 MB/op ArrayBuffers: 105.93±12.19 MB/op McapIndexedReader 2.38±0.02 op/s Heap Used: 72.00±1.70 MB/op Heap Total: 55.12±2.51 MB/op ArrayBuffers: 17.86±1.85 MB/op McapIndexedReader_reverse 2.38±0.02 op/s Heap Used: 63.41±1.55 MB/op Heap Total: 39.33±0.53 MB/op ArrayBuffers: 18.40±1.60 MB/op ``` ### Followups - Simplify parsing code further after exploring monomorphic callpaths - Can further improve DataView/Reader churn in indexed readers --- typescript/core/src/ChunkCursor.ts | 25 +- typescript/core/src/McapIndexedReader.ts | 159 +++-- typescript/core/src/McapStreamReader.ts | 100 ++-- typescript/core/src/McapWriter.test.ts | 12 +- typescript/core/src/Reader.ts | 38 +- typescript/core/src/StreamBuffer.test.ts | 47 -- typescript/core/src/StreamBuffer.ts | 58 -- typescript/core/src/parse.ts | 717 ++++++++++++----------- 8 files changed, 556 insertions(+), 600 deletions(-) delete mode 100644 typescript/core/src/StreamBuffer.test.ts delete mode 100644 
typescript/core/src/StreamBuffer.ts diff --git a/typescript/core/src/ChunkCursor.ts b/typescript/core/src/ChunkCursor.ts index 3a33303010..2113cc2b42 100644 --- a/typescript/core/src/ChunkCursor.ts +++ b/typescript/core/src/ChunkCursor.ts @@ -1,3 +1,4 @@ +import Reader from "./Reader"; import { parseRecord } from "./parse"; import { sortedIndexBy } from "./sortedIndexBy"; import { sortedLastIndexBy } from "./sortedLastIndex"; @@ -136,31 +137,25 @@ export class ChunkCursor { messageIndexes.byteLength, ); - let offset = 0; + const reader = new Reader(messageIndexesView); const arrayOfMessageOffsets: [logTime: bigint, offset: bigint][][] = []; - for ( - let result; - (result = parseRecord({ view: messageIndexesView, startOffset: offset, validateCrcs: true })), - result.record; - offset += result.usedBytes - ) { - if (result.record.type !== "MessageIndex") { + let record; + while ((record = parseRecord(reader, true))) { + if (record.type !== "MessageIndex") { continue; } if ( - result.record.records.length === 0 || - (this.#relevantChannels && !this.#relevantChannels.has(result.record.channelId)) + record.records.length === 0 || + (this.#relevantChannels && !this.#relevantChannels.has(record.channelId)) ) { continue; } - arrayOfMessageOffsets.push(result.record.records); + arrayOfMessageOffsets.push(record.records); } - if (offset !== messageIndexesView.byteLength) { - throw new Error( - `${messageIndexesView.byteLength - offset} bytes remaining in message index section`, - ); + if (reader.bytesRemaining() !== 0) { + throw new Error(`${reader.bytesRemaining()} bytes remaining in message index section`); } this.#orderedMessageOffsets = arrayOfMessageOffsets diff --git a/typescript/core/src/McapIndexedReader.ts b/typescript/core/src/McapIndexedReader.ts index 5955300a40..51ec00044e 100644 --- a/typescript/core/src/McapIndexedReader.ts +++ b/typescript/core/src/McapIndexedReader.ts @@ -2,6 +2,7 @@ import { crc32, crc32Final, crc32Init, crc32Update } from "@foxglove/crc"; import Heap from "heap-js"; import { ChunkCursor } from "./ChunkCursor"; +import Reader from "./Reader"; import { MCAP_MAGIC } from "./constants"; import { parseMagic, parseRecord } from "./parse"; import { DecompressHandlers, IReadable, TypedMcapRecords } from "./types"; @@ -111,7 +112,7 @@ export class McapIndexedReader { headerPrefix.byteOffset, headerPrefix.byteLength, ); - void parseMagic(headerPrefixView, 0); + void parseMagic(new Reader(headerPrefixView)); const headerContentLength = headerPrefixView.getBigUint64( MCAP_MAGIC.length + /* Opcode.HEADER */ 1, true, @@ -121,26 +122,19 @@ export class McapIndexedReader { const headerRecord = await readable.read(BigInt(MCAP_MAGIC.length), headerReadLength); headerEndOffset = BigInt(MCAP_MAGIC.length) + headerReadLength; - const headerResult = parseRecord({ - view: new DataView(headerRecord.buffer, headerRecord.byteOffset, headerRecord.byteLength), - startOffset: 0, - validateCrcs: true, - }); - if (headerResult.record?.type !== "Header") { + const headerReader = new Reader( + new DataView(headerRecord.buffer, headerRecord.byteOffset, headerRecord.byteLength), + ); + const headerResult = parseRecord(headerReader, true); + if (headerResult?.type !== "Header") { throw new Error( - `Unable to read header at beginning of file; found ${ - headerResult.record?.type ?? "nothing" - }`, + `Unable to read header at beginning of file; found ${headerResult?.type ?? 
"nothing"}`, ); } - if (headerResult.usedBytes !== headerRecord.byteLength) { - throw new Error( - `${ - headerRecord.byteLength - headerResult.usedBytes - } bytes remaining after parsing header`, - ); + if (headerReader.bytesRemaining() !== 0) { + throw new Error(`${headerReader.bytesRemaining()} bytes remaining after parsing header`); } - header = headerResult.record; + header = headerResult; } function errorWithLibrary(message: string): Error { @@ -179,33 +173,32 @@ export class McapIndexedReader { } try { - void parseMagic(footerAndMagicView, footerAndMagicView.byteLength - MCAP_MAGIC.length); + void parseMagic( + new Reader(footerAndMagicView, footerAndMagicView.byteLength - MCAP_MAGIC.length), + ); } catch (error) { throw errorWithLibrary((error as Error).message); } let footer: TypedMcapRecords["Footer"]; { - const footerResult = parseRecord({ - view: footerAndMagicView, - startOffset: 0, - validateCrcs: true, - }); - if (footerResult.record?.type !== "Footer") { + const footerReader = new Reader(footerAndMagicView); + const footerRecord = parseRecord(footerReader, true); + if (footerRecord?.type !== "Footer") { throw errorWithLibrary( `Unable to read footer from end of file (offset ${footerOffset}); found ${ - footerResult.record?.type ?? "nothing" + footerRecord?.type ?? "nothing" }`, ); } - if (footerResult.usedBytes !== footerAndMagicView.byteLength - MCAP_MAGIC.length) { + if (footerReader.bytesRemaining() !== MCAP_MAGIC.length) { throw errorWithLibrary( `${ - footerAndMagicView.byteLength - MCAP_MAGIC.length - footerResult.usedBytes + footerReader.bytesRemaining() - MCAP_MAGIC.length } bytes remaining after parsing footer`, ); } - footer = footerResult.record; + footer = footerRecord; } if (footer.summaryStart === 0n) { throw errorWithLibrary("File is not indexed"); @@ -261,6 +254,7 @@ export class McapIndexedReader { dataEndAndSummarySection.byteOffset, dataEndAndSummarySection.byteLength, ); + const indexReader = new Reader(indexView); const channelsById = new Map(); const schemasById = new Map(); @@ -271,46 +265,42 @@ export class McapIndexedReader { let statistics: TypedMcapRecords["Statistics"] | undefined; let dataSectionCrc: number | undefined; - let offset = 0; - for ( - let result; - (result = parseRecord({ view: indexView, startOffset: offset, validateCrcs: true })), - result.record; - offset += result.usedBytes - ) { - if (offset === 0 && result.record.type !== "DataEnd") { + let first = true; + let result; + while ((result = parseRecord(indexReader, true))) { + if (first && result.type !== "DataEnd") { throw errorWithLibrary( - `Expected DataEnd record to precede summary section, but found ${result.record.type}`, + `Expected DataEnd record to precede summary section, but found ${result.type}`, ); } - switch (result.record.type) { + first = false; + switch (result.type) { case "Schema": - schemasById.set(result.record.id, result.record); + schemasById.set(result.id, result); break; case "Channel": - channelsById.set(result.record.id, result.record); + channelsById.set(result.id, result); break; case "ChunkIndex": - chunkIndexes.push(result.record); + chunkIndexes.push(result); break; case "AttachmentIndex": - attachmentIndexes.push(result.record); + attachmentIndexes.push(result); break; case "MetadataIndex": - metadataIndexes.push(result.record); + metadataIndexes.push(result); break; case "Statistics": if (statistics) { throw errorWithLibrary("Duplicate Statistics record"); } - statistics = result.record; + statistics = result; break; case "SummaryOffset": - 
summaryOffsetsByOpcode.set(result.record.groupOpcode, result.record); + summaryOffsetsByOpcode.set(result.groupOpcode, result); break; case "DataEnd": - dataSectionCrc = - result.record.dataSectionCrc === 0 ? undefined : result.record.dataSectionCrc; + dataSectionCrc = result.dataSectionCrc === 0 ? undefined : result.dataSectionCrc; break; case "Header": case "Footer": @@ -319,13 +309,13 @@ export class McapIndexedReader { case "MessageIndex": case "Attachment": case "Metadata": - throw errorWithLibrary(`${result.record.type} record not allowed in index section`); + throw errorWithLibrary(`${result.type} record not allowed in index section`); case "Unknown": break; } } - if (offset !== indexView.byteLength) { - throw errorWithLibrary(`${indexView.byteLength - offset} bytes remaining in index section`); + if (indexReader.bytesRemaining() !== 0) { + throw errorWithLibrary(`${indexReader.bytesRemaining()} bytes remaining in index section`); } return new McapIndexedReader({ @@ -395,6 +385,7 @@ export class McapIndexedReader { // cursor becomes active (i.e. when we first need to access messages from the chunk) and removed // when the cursor is removed from the heap. const chunkViewCache = new Map(); + const chunkReader = new Reader(new DataView(new ArrayBuffer(0))); for (let cursor; (cursor = chunkCursors.peek()); ) { if (!cursor.hasMessageIndexes()) { // If we encounter a chunk whose message indexes have not been loaded yet, load them and re-organize the heap. @@ -421,27 +412,24 @@ export class McapIndexedReader { `Message offset beyond chunk bounds (log time ${logTime}, offset ${offset}, chunk data length ${chunkView.byteLength}) in chunk at offset ${cursor.chunkIndex.chunkStartOffset}`, ); } - const result = parseRecord({ - view: chunkView, - startOffset: Number(offset), - validateCrcs: validateCrcs ?? true, - }); - if (!result.record) { + chunkReader.reset(chunkView, Number(offset)); + const record = parseRecord(chunkReader, validateCrcs ?? true); + if (!record) { throw this.#errorWithLibrary( `Unable to parse record at offset ${offset} in chunk at offset ${cursor.chunkIndex.chunkStartOffset}`, ); } - if (result.record.type !== "Message") { + if (record.type !== "Message") { throw this.#errorWithLibrary( - `Unexpected record type ${result.record.type} in message index (time ${logTime}, offset ${offset} in chunk at offset ${cursor.chunkIndex.chunkStartOffset})`, + `Unexpected record type ${record.type} in message index (time ${logTime}, offset ${offset} in chunk at offset ${cursor.chunkIndex.chunkStartOffset})`, ); } - if (result.record.logTime !== logTime) { + if (record.logTime !== logTime) { throw this.#errorWithLibrary( - `Message log time ${result.record.logTime} did not match message index entry (${logTime} at offset ${offset} in chunk at offset ${cursor.chunkIndex.chunkStartOffset})`, + `Message log time ${record.logTime} did not match message index entry (${logTime} at offset ${offset} in chunk at offset ${cursor.chunkIndex.chunkStartOffset})`, ); } - yield result.record; + yield record; if (cursor.hasMoreMessages()) { // There is no need to reorganize the heap when chunks are ordered and not overlapping. 
@@ -468,19 +456,18 @@ export class McapIndexedReader { continue; } const metadataData = await this.#readable.read(metadataIndex.offset, metadataIndex.length); - const metadataResult = parseRecord({ - view: new DataView(metadataData.buffer, metadataData.byteOffset, metadataData.byteLength), - startOffset: 0, - validateCrcs: false, - }); - if (metadataResult.record?.type !== "Metadata") { + const metadataReader = new Reader( + new DataView(metadataData.buffer, metadataData.byteOffset, metadataData.byteLength), + ); + const metadataRecord = parseRecord(metadataReader, false); + if (metadataRecord?.type !== "Metadata") { throw this.#errorWithLibrary( `Metadata data at offset ${ metadataIndex.offset - } does not point to metadata record (found ${String(metadataResult.record?.type)})`, + } does not point to metadata record (found ${String(metadataRecord?.type)})`, ); } - yield metadataResult.record; + yield metadataRecord; } } @@ -519,23 +506,18 @@ export class McapIndexedReader { attachmentIndex.offset, attachmentIndex.length, ); - const attachmentResult = parseRecord({ - view: new DataView( - attachmentData.buffer, - attachmentData.byteOffset, - attachmentData.byteLength, - ), - startOffset: 0, - validateCrcs: validateCrcs ?? true, - }); - if (attachmentResult.record?.type !== "Attachment") { + const attachmentReader = new Reader( + new DataView(attachmentData.buffer, attachmentData.byteOffset, attachmentData.byteLength), + ); + const attachmentRecord = parseRecord(attachmentReader, validateCrcs ?? true); + if (attachmentRecord?.type !== "Attachment") { throw this.#errorWithLibrary( `Attachment data at offset ${ attachmentIndex.offset - } does not point to attachment record (found ${String(attachmentResult.record?.type)})`, + } does not point to attachment record (found ${String(attachmentRecord?.type)})`, ); } - yield attachmentResult.record; + yield attachmentRecord; } } @@ -547,20 +529,19 @@ export class McapIndexedReader { chunkIndex.chunkStartOffset, chunkIndex.chunkLength, ); - const chunkResult = parseRecord({ - view: new DataView(chunkData.buffer, chunkData.byteOffset, chunkData.byteLength), - startOffset: 0, - validateCrcs: options?.validateCrcs ?? true, - }); - if (chunkResult.record?.type !== "Chunk") { + const chunkReader = new Reader( + new DataView(chunkData.buffer, chunkData.byteOffset, chunkData.byteLength), + ); + const chunkRecord = parseRecord(chunkReader, options?.validateCrcs ?? 
true); + if (chunkRecord?.type !== "Chunk") { throw this.#errorWithLibrary( `Chunk start offset ${ chunkIndex.chunkStartOffset - } does not point to chunk record (found ${String(chunkResult.record?.type)})`, + } does not point to chunk record (found ${String(chunkRecord?.type)})`, ); } - const chunk = chunkResult.record; + const chunk = chunkRecord; let buffer = chunk.records; if (chunk.compression !== "" && buffer.byteLength > 0) { const decompress = this.#decompressHandlers?.[chunk.compression]; diff --git a/typescript/core/src/McapStreamReader.ts b/typescript/core/src/McapStreamReader.ts index ddd412ef38..6ed8292ce9 100644 --- a/typescript/core/src/McapStreamReader.ts +++ b/typescript/core/src/McapStreamReader.ts @@ -1,6 +1,6 @@ import { crc32 } from "@foxglove/crc"; -import StreamBuffer from "./StreamBuffer"; +import Reader from "./Reader"; import { MCAP_MAGIC } from "./constants"; import { parseMagic, parseRecord } from "./parse"; import { Channel, DecompressHandlers, McapMagic, TypedMcapRecord, TypedMcapRecords } from "./types"; @@ -50,7 +50,9 @@ type McapReaderOptions = { * ``` */ export default class McapStreamReader { - #buffer = new StreamBuffer(MCAP_MAGIC.length * 2); + #buffer = new ArrayBuffer(MCAP_MAGIC.length * 2); + #view = new DataView(this.#buffer); + #reader = new Reader(this.#view, MCAP_MAGIC.length * 2); // Cursor starts at end of initial buffer #decompressHandlers; #includeChunks; #validateCrcs; @@ -78,7 +80,7 @@ export default class McapStreamReader { /** @returns The number of bytes that have been received by `append()` but not yet parsed. */ bytesRemaining(): number { - return this.#buffer.bytesRemaining(); + return this.#reader.bytesRemaining(); } /** @@ -89,7 +91,41 @@ export default class McapStreamReader { if (this.#doneReading) { throw new Error("Already done reading"); } - this.#buffer.append(data); + this.#appendOrShift(data); + this.#reader.reset(this.#view); + } + #appendOrShift(data: Uint8Array): void { + /** Add data to the buffer, shifting existing data or reallocating if necessary. */ + const remainingBytes = this.#reader.bytesRemaining(); + const totalNeededBytes = remainingBytes + data.byteLength; + + if (totalNeededBytes <= this.#buffer.byteLength) { + // Data fits in the current buffer + if (this.#view.byteOffset + totalNeededBytes <= this.#buffer.byteLength) { + // Data fits by appending only + const array = new Uint8Array(this.#buffer, this.#view.byteOffset); + array.set(data, remainingBytes); + this.#view = new DataView(this.#buffer, this.#view.byteOffset, totalNeededBytes); + } else { + // Data fits but requires moving existing data to start of buffer + const existingData = new Uint8Array(this.#buffer, this.#view.byteOffset, remainingBytes); + const array = new Uint8Array(this.#buffer); + array.set(existingData, 0); + array.set(data, existingData.byteLength); + this.#view = new DataView(this.#buffer, 0, totalNeededBytes); + } + } else { + // New data doesn't fit, copy to a new buffer + + // Currently, the new buffer size may be smaller than the old size. For future optimizations, + // we could consider making the buffer size increase monotonically. 
+ this.#buffer = new ArrayBuffer(totalNeededBytes * 2); + const array = new Uint8Array(this.#buffer); + const existingData = new Uint8Array(this.#view.buffer, this.#view.byteOffset, remainingBytes); + array.set(existingData, 0); + array.set(data, existingData.byteLength); + this.#view = new DataView(this.#buffer, 0, totalNeededBytes); + } } /** @@ -129,11 +165,10 @@ export default class McapStreamReader { *#read(): Generator { if (!this.#noMagicPrefix) { - let magic: McapMagic | undefined, usedBytes: number | undefined; - while ((({ magic, usedBytes } = parseMagic(this.#buffer.view, 0)), !magic)) { + let magic: McapMagic | undefined; + while (((magic = parseMagic(this.#reader)), !magic)) { yield; } - this.#buffer.consume(usedBytes); } let header: TypedMcapRecords["Header"] | undefined; @@ -144,20 +179,10 @@ export default class McapStreamReader { for (;;) { let record; - { - let usedBytes; - while ( - (({ record, usedBytes } = parseRecord({ - view: this.#buffer.view, - startOffset: 0, - validateCrcs: this.#validateCrcs, - })), - !record) - ) { - yield; - } - this.#buffer.consume(usedBytes); + while (((record = parseRecord(this.#reader, this.#validateCrcs)), !record)) { + yield; } + switch (record.type) { case "Unknown": break; @@ -206,18 +231,10 @@ export default class McapStreamReader { } } const view = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength); - let chunkOffset = 0; - for ( - let chunkResult; - (chunkResult = parseRecord({ - view, - startOffset: chunkOffset, - validateCrcs: this.#validateCrcs, - })), - chunkResult.record; - chunkOffset += chunkResult.usedBytes - ) { - switch (chunkResult.record.type) { + const chunkReader = new Reader(view); + let chunkRecord; + while ((chunkRecord = parseRecord(chunkReader, this.#validateCrcs))) { + switch (chunkRecord.type) { case "Unknown": break; case "Header": @@ -232,34 +249,31 @@ export default class McapStreamReader { case "MetadataIndex": case "SummaryOffset": case "DataEnd": - throw errorWithLibrary( - `${chunkResult.record.type} record not allowed inside a chunk`, - ); + throw errorWithLibrary(`${chunkRecord.type} record not allowed inside a chunk`); case "Schema": case "Channel": case "Message": - yield chunkResult.record; + yield chunkRecord; break; } } - if (chunkOffset !== buffer.byteLength) { - throw errorWithLibrary(`${buffer.byteLength - chunkOffset} bytes remaining in chunk`); + if (chunkReader.bytesRemaining() !== 0) { + throw errorWithLibrary(`${chunkReader.bytesRemaining()} bytes remaining in chunk`); } break; } case "Footer": try { - let magic, usedBytes; - while ((({ magic, usedBytes } = parseMagic(this.#buffer.view, 0)), !magic)) { + let magic; + while (((magic = parseMagic(this.#reader)), !magic)) { yield; } - this.#buffer.consume(usedBytes); } catch (error) { throw errorWithLibrary((error as Error).message); } - if (this.#buffer.bytesRemaining() !== 0) { + if (this.#reader.bytesRemaining() !== 0) { throw errorWithLibrary( - `${this.#buffer.bytesRemaining()} bytes remaining after MCAP footer and trailing magic`, + `${this.#reader.bytesRemaining()} bytes remaining after MCAP footer and trailing magic`, ); } return record; diff --git a/typescript/core/src/McapWriter.test.ts b/typescript/core/src/McapWriter.test.ts index 57bdcd82fc..e16bc515d5 100644 --- a/typescript/core/src/McapWriter.test.ts +++ b/typescript/core/src/McapWriter.test.ts @@ -3,6 +3,7 @@ import { crc32 } from "@foxglove/crc"; import { McapIndexedReader } from "./McapIndexedReader"; import McapStreamReader from "./McapStreamReader"; import { 
McapWriter } from "./McapWriter"; +import Reader from "./Reader"; import { TempBuffer } from "./TempBuffer"; import { MCAP_MAGIC, Opcode } from "./constants"; import { parseMagic, parseRecord } from "./parse"; @@ -278,13 +279,12 @@ describe("McapWriter", () => { const array = tempBuffer.get(); const view = new DataView(array.buffer, array.byteOffset, array.byteLength); + const reader = new Reader(view); const records: TypedMcapRecord[] = []; - for ( - let offset = parseMagic(view, 0).usedBytes, result; - (result = parseRecord({ view, startOffset: offset, validateCrcs: true })), result.record; - offset += result.usedBytes - ) { - records.push(result.record); + parseMagic(reader); + let result; + while ((result = parseRecord(reader, true))) { + records.push(result); } const expectedChunkData = new Uint8Array([ diff --git a/typescript/core/src/Reader.ts b/typescript/core/src/Reader.ts index fcc2887237..d0136c648b 100644 --- a/typescript/core/src/Reader.ts +++ b/typescript/core/src/Reader.ts @@ -7,13 +7,27 @@ const textDecoder = new TextDecoder(); export default class Reader { #view: DataView; + #viewU8: Uint8Array; offset: number; constructor(view: DataView, offset = 0) { this.#view = view; + this.#viewU8 = new Uint8Array(view.buffer, view.byteOffset, view.byteLength); this.offset = offset; } + // Should be ~identical to the constructor, it allows us to reinitialize the reader when + // the view changes, without creating a new instance, avoiding allocation / GC overhead + reset(view: DataView, offset = 0): void { + this.#view = view; + this.#viewU8 = new Uint8Array(view.buffer, view.byteOffset, view.byteLength); + this.offset = offset; + } + + bytesRemaining(): number { + return this.#viewU8.length - this.offset; + } + uint8(): number { const value = this.#view.getUint8(this.offset); this.offset += 1; @@ -40,14 +54,12 @@ export default class Reader { string(): string { const length = this.uint32(); - if (this.offset + length > this.#view.byteLength) { + if (length === 0) { + return ""; + } else if (length > this.bytesRemaining()) { throw new Error(`String length ${length} exceeds bounds of buffer`); } - const value = textDecoder.decode( - new Uint8Array(this.#view.buffer, this.#view.byteOffset + this.offset, length), - ); - this.offset += length; - return value; + return textDecoder.decode(this.u8ArrayBorrow(length)); } keyValuePairs(readKey: (reader: Reader) => K, readValue: (reader: Reader) => V): [K, V][] { @@ -103,4 +115,18 @@ export default class Reader { } return result; } + + // Read a borrowed Uint8Array, useful temp references or borrow semantics + u8ArrayBorrow(length: number): Uint8Array { + const result = this.#viewU8.subarray(this.offset, this.offset + length); + this.offset += length; + return result; + } + + // Read a copied Uint8Array from the underlying buffer, use when you need to keep the data around + u8ArrayCopy(length: number): Uint8Array { + const result = this.#viewU8.slice(this.offset, this.offset + length); + this.offset += length; + return result; + } } diff --git a/typescript/core/src/StreamBuffer.test.ts b/typescript/core/src/StreamBuffer.test.ts deleted file mode 100644 index a45175b3e3..0000000000 --- a/typescript/core/src/StreamBuffer.test.ts +++ /dev/null @@ -1,47 +0,0 @@ -import StreamBuffer from "./StreamBuffer"; - -function toArray(view: DataView) { - return new Uint8Array(view.buffer, view.byteOffset, view.byteLength); -} - -describe("ByteStorage", () => { - it("handles basic append and consume", () => { - const buffer = new StreamBuffer(); - 
expect(buffer.bytesRemaining()).toBe(0); - - buffer.append(new Uint8Array([1, 2, 3])); - expect(buffer.bytesRemaining()).toBe(3); - expect(() => { - buffer.consume(4); - }).toThrow(); - - expect(toArray(buffer.view)).toEqual(new Uint8Array([1, 2, 3])); - buffer.consume(3); - expect(buffer.bytesRemaining()).toBe(0); - }); - - it("handles partial consume", () => { - const buffer = new StreamBuffer(); - - buffer.append(new Uint8Array([1, 2, 3, 4, 5])); - expect(buffer.bytesRemaining()).toBe(5); - buffer.consume(2); - expect(buffer.bytesRemaining()).toBe(3); - - expect(toArray(buffer.view)).toEqual(new Uint8Array([3, 4, 5])); - buffer.consume(3); - expect(buffer.bytesRemaining()).toBe(0); - }); - - it("reuses buffer within allocated capacity", () => { - const buffer = new StreamBuffer(5); - const rawBuffer = buffer.view.buffer; - buffer.append(new Uint8Array([1, 2])); - expect(buffer.view.buffer).toBe(rawBuffer); - buffer.append(new Uint8Array([3, 4, 5])); - expect(buffer.view.buffer).toBe(rawBuffer); - buffer.append(new Uint8Array([6, 7])); - expect(buffer.view.buffer).not.toBe(rawBuffer); - expect(toArray(buffer.view)).toEqual(new Uint8Array([1, 2, 3, 4, 5, 6, 7])); - }); -}); diff --git a/typescript/core/src/StreamBuffer.ts b/typescript/core/src/StreamBuffer.ts deleted file mode 100644 index 98eaa785d5..0000000000 --- a/typescript/core/src/StreamBuffer.ts +++ /dev/null @@ -1,58 +0,0 @@ -/** - * A growable buffer for use when processing a stream of data. - */ -export default class StreamBuffer { - #buffer: ArrayBuffer; - public view: DataView; - - constructor(initialCapacity = 0) { - this.#buffer = new ArrayBuffer(initialCapacity); - this.view = new DataView(this.#buffer, 0, 0); - } - - bytesRemaining(): number { - return this.view.byteLength; - } - - /** Mark some data as consumed, so the memory can be reused when new data is appended. */ - consume(count: number): void { - this.view = new DataView( - this.#buffer, - this.view.byteOffset + count, - this.view.byteLength - count, - ); - } - - /** Add data to the buffer, shifting existing data or reallocating if necessary. */ - append(data: Uint8Array): void { - if (this.view.byteOffset + this.view.byteLength + data.byteLength <= this.#buffer.byteLength) { - // Data fits by appending only - const array = new Uint8Array(this.view.buffer, this.view.byteOffset); - array.set(data, this.view.byteLength); - this.view = new DataView( - this.#buffer, - this.view.byteOffset, - this.view.byteLength + data.byteLength, - ); - } else if (this.view.byteLength + data.byteLength <= this.#buffer.byteLength) { - // Data fits in allocated buffer but requires moving existing data to start of buffer - const oldData = new Uint8Array(this.#buffer, this.view.byteOffset, this.view.byteLength); - const array = new Uint8Array(this.#buffer); - array.set(oldData, 0); - array.set(data, oldData.byteLength); - this.view = new DataView(this.#buffer, 0, this.view.byteLength + data.byteLength); - } else { - // New data doesn't fit, copy to a new buffer - - // Currently, the new buffer size may be smaller than the old size. For future optimizations, - // we could consider making the buffer size increase monotonically. 
- - const oldData = new Uint8Array(this.#buffer, this.view.byteOffset, this.view.byteLength); - this.#buffer = new ArrayBuffer((this.view.byteLength + data.byteLength) * 2); - const array = new Uint8Array(this.#buffer); - array.set(oldData, 0); - array.set(data, oldData.byteLength); - this.view = new DataView(this.#buffer, 0, this.view.byteLength + data.byteLength); - } - } -} diff --git a/typescript/core/src/parse.ts b/typescript/core/src/parse.ts index 7f2fe80285..95d0105750 100644 --- a/typescript/core/src/parse.ts +++ b/typescript/core/src/parse.ts @@ -1,374 +1,419 @@ import { crc32 } from "@foxglove/crc"; import Reader from "./Reader"; -import { isKnownOpcode, MCAP_MAGIC, Opcode } from "./constants"; +import { MCAP_MAGIC, Opcode } from "./constants"; import { McapMagic, TypedMcapRecord } from "./types"; /** * Parse a MCAP magic string at `startOffset` in `view`. */ -export function parseMagic( - view: DataView, - startOffset: number, -): { magic: McapMagic; usedBytes: number } | { magic?: undefined; usedBytes: 0 } { - if (startOffset + MCAP_MAGIC.length > view.byteLength) { - return { usedBytes: 0 }; +export function parseMagic(reader: Reader): McapMagic | undefined { + if (reader.bytesRemaining() < MCAP_MAGIC.length) { + return undefined; } - if (!MCAP_MAGIC.every((val, i) => val === view.getUint8(startOffset + i))) { + const magic = reader.u8ArrayBorrow(MCAP_MAGIC.length); + if (!MCAP_MAGIC.every((val, i) => val === magic[i])) { throw new Error( `Expected MCAP magic '${MCAP_MAGIC.map((val) => val.toString(16).padStart(2, "0")).join( " ", - )}', found '${Array.from(MCAP_MAGIC, (_, i) => - view - .getUint8(startOffset + i) - .toString(16) - .padStart(2, "0"), - ).join(" ")}'`, + )}', found '${Array.from(magic, (_, i) => magic[i]!.toString(16).padStart(2, "0")).join( + " ", + )}'`, ); } - return { - magic: { specVersion: "0" }, - usedBytes: MCAP_MAGIC.length, - }; + return { specVersion: "0" }; } /** - * Parse a MCAP record beginning at `startOffset` in `view`. 
+ * Parse a MCAP record from the given reader */ -export function parseRecord({ - view, - startOffset, - validateCrcs, -}: { - view: DataView; - startOffset: number; - validateCrcs: boolean; -}): { record: TypedMcapRecord; usedBytes: number } | { record?: undefined; usedBytes: 0 } { - if (startOffset + /*opcode*/ 1 + /*record content length*/ 8 >= view.byteLength) { - return { usedBytes: 0 }; +// NOTE: internal function in the hot path, (de)structuring args would be wasteful, acceptable perf/clarity tradeoff +// eslint-disable-next-line @foxglove/no-boolean-parameters +export function parseRecord(reader: Reader, validateCrcs = false): TypedMcapRecord | undefined { + const RECORD_HEADER_SIZE = 1 /*opcode*/ + 8; /*record content length*/ + if (reader.bytesRemaining() < RECORD_HEADER_SIZE) { + return undefined; } - const headerReader = new Reader(view, startOffset); + const start = reader.offset; + const opcode = reader.uint8(); + const recordLength = reader.uint64(); - const opcode = headerReader.uint8(); - - const recordLength = headerReader.uint64(); if (recordLength > Number.MAX_SAFE_INTEGER) { throw new Error(`Record content length ${recordLength} is too large`); } + const recordLengthNum = Number(recordLength); - const recordEndOffset = headerReader.offset + recordLengthNum; - if (recordEndOffset > view.byteLength) { - return { usedBytes: 0 }; + + if (reader.bytesRemaining() < recordLengthNum) { + reader.offset = start; // Rewind to the start of the record + return undefined; } - if (!isKnownOpcode(opcode)) { - const data = new Uint8Array( - view.buffer, - view.byteOffset + headerReader.offset, - recordLengthNum, - ); - const record: TypedMcapRecord = { - type: "Unknown", - opcode, - data, - }; - return { record, usedBytes: recordEndOffset - startOffset }; + let result: TypedMcapRecord; + switch (opcode as Opcode) { + case Opcode.HEADER: + result = parseHeader(reader, recordLengthNum); + break; + case Opcode.FOOTER: + result = parseFooter(reader, recordLengthNum); + break; + case Opcode.SCHEMA: + result = parseSchema(reader, recordLengthNum); + break; + case Opcode.CHANNEL: + result = parseChannel(reader, recordLengthNum); + break; + case Opcode.MESSAGE: + result = parseMessage(reader, recordLengthNum); + break; + case Opcode.CHUNK: + result = parseChunk(reader, recordLengthNum); + break; + case Opcode.MESSAGE_INDEX: + result = parseMessageIndex(reader, recordLengthNum); + break; + case Opcode.CHUNK_INDEX: + result = parseChunkIndex(reader, recordLengthNum); + break; + case Opcode.ATTACHMENT: + result = parseAttachment(reader, recordLengthNum, validateCrcs); + break; + case Opcode.ATTACHMENT_INDEX: + result = parseAttachmentIndex(reader, recordLengthNum); + break; + case Opcode.STATISTICS: + result = parseStatistics(reader, recordLengthNum); + break; + case Opcode.METADATA: + result = parseMetadata(reader, recordLengthNum); + break; + case Opcode.METADATA_INDEX: + result = parseMetadataIndex(reader, recordLengthNum); + break; + case Opcode.SUMMARY_OFFSET: + result = parseSummaryOffset(reader, recordLengthNum); + break; + case Opcode.DATA_END: + result = parseDataEnd(reader, recordLengthNum); + break; + default: + result = parseUnknown(reader, recordLengthNum, opcode); + break; } - const recordView = new DataView( - view.buffer, - view.byteOffset + headerReader.offset, - recordLengthNum, + // NOTE: a bit redundant, but ensures we've advanced by the full record length + // TODO: simplify this when we explore monomorphic paths + reader.offset = start + RECORD_HEADER_SIZE + recordLengthNum; 
+ + return result; +} + +function parseUnknown(reader: Reader, recordLength: number, opcode: number): TypedMcapRecord { + const data = reader.u8ArrayBorrow(recordLength); + return { + type: "Unknown", + opcode, + data, + }; +} + +function parseHeader(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const profile = reader.string(); + const library = reader.string(); + reader.offset = startOffset + recordLength; + return { type: "Header", profile, library }; +} + +function parseFooter(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const summaryStart = reader.uint64(); + const summaryOffsetStart = reader.uint64(); + const summaryCrc = reader.uint32(); + reader.offset = startOffset + recordLength; + return { + type: "Footer", + summaryStart, + summaryOffsetStart, + summaryCrc, + }; +} + +function parseSchema(reader: Reader, recordLength: number): TypedMcapRecord { + const start = reader.offset; + const id = reader.uint16(); + const name = reader.string(); + const encoding = reader.string(); + const dataLen = reader.uint32(); + const end = reader.offset; + if (recordLength - (end - start) < dataLen) { + throw new Error(`Schema data length ${dataLen} exceeds bounds of record`); + } + const data = reader.u8ArrayCopy(dataLen); + reader.offset = start + recordLength; + + return { + type: "Schema", + id, + encoding, + name, + data, + }; +} + +function parseChannel(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const channelId = reader.uint16(); + const schemaId = reader.uint16(); + const topicName = reader.string(); + const messageEncoding = reader.string(); + const metadata = reader.map( + (r) => r.string(), + (r) => r.string(), ); - const reader = new Reader(recordView); - - switch (opcode) { - case Opcode.HEADER: { - const profile = reader.string(); - const library = reader.string(); - const record: TypedMcapRecord = { type: "Header", profile, library }; - return { record, usedBytes: recordEndOffset - startOffset }; - } + reader.offset = startOffset + recordLength; - case Opcode.FOOTER: { - const summaryStart = reader.uint64(); - const summaryOffsetStart = reader.uint64(); - const summaryCrc = reader.uint32(); - const record: TypedMcapRecord = { - type: "Footer", - summaryStart, - summaryOffsetStart, - summaryCrc, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } + return { + type: "Channel", + id: channelId, + schemaId, + topic: topicName, + messageEncoding, + metadata, + }; +} - case Opcode.SCHEMA: { - const id = reader.uint16(); - const name = reader.string(); - const encoding = reader.string(); - const dataLen = reader.uint32(); - if (reader.offset + dataLen > recordView.byteLength) { - throw new Error(`Schema data length ${dataLen} exceeds bounds of record`); - } - const data = new Uint8Array( - recordView.buffer, - recordView.byteOffset + reader.offset, - dataLen, - ).slice(); - reader.offset += dataLen; - - const record: TypedMcapRecord = { - type: "Schema", - id, - encoding, - name, - data, - }; - - return { record, usedBytes: recordEndOffset - startOffset }; - } +function parseMessage(reader: Reader, recordLength: number): TypedMcapRecord { + const MESSAGE_PREFIX_SIZE = 2 + 4 + 8 + 8; // channelId, sequence, logTime, publishTime + const channelId = reader.uint16(); + const sequence = reader.uint32(); + const logTime = reader.uint64(); + const publishTime = reader.uint64(); + const data = reader.u8ArrayCopy(recordLength - 
MESSAGE_PREFIX_SIZE); + return { + type: "Message", + channelId, + sequence, + logTime, + publishTime, + data, + }; +} - case Opcode.CHANNEL: { - const channelId = reader.uint16(); - const schemaId = reader.uint16(); - const topicName = reader.string(); - const messageEncoding = reader.string(); - const metadata = reader.map( - (r) => r.string(), - (r) => r.string(), - ); - - const record: TypedMcapRecord = { - type: "Channel", - id: channelId, - schemaId, - topic: topicName, - messageEncoding, - metadata, - }; - - return { record, usedBytes: recordEndOffset - startOffset }; - } +function parseChunk(reader: Reader, recordLength: number): TypedMcapRecord { + const start = reader.offset; + const startTime = reader.uint64(); + const endTime = reader.uint64(); + const uncompressedSize = reader.uint64(); + const uncompressedCrc = reader.uint32(); + const compression = reader.string(); + const recordsByteLength = Number(reader.uint64()); + const end = reader.offset; + const prefixSize = end - start; + if (recordsByteLength + prefixSize > recordLength) { + throw new Error("Chunk records length exceeds remaining record size"); + } + const records = reader.u8ArrayCopy(recordsByteLength); + reader.offset = start + recordLength; + return { + type: "Chunk", + messageStartTime: startTime, + messageEndTime: endTime, + compression, + uncompressedSize, + uncompressedCrc, + records, + }; +} - case Opcode.MESSAGE: { - const channelId = reader.uint16(); - const sequence = reader.uint32(); - const logTime = reader.uint64(); - const publishTime = reader.uint64(); - const data = new Uint8Array( - recordView.buffer, - recordView.byteOffset + reader.offset, - recordView.byteLength - reader.offset, - ).slice(); - const record: TypedMcapRecord = { - type: "Message", - channelId, - sequence, - logTime, - publishTime, - data, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } +function parseMessageIndex(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const channelId = reader.uint16(); + const records = reader.keyValuePairs( + (r) => r.uint64(), + (r) => r.uint64(), + ); + reader.offset = startOffset + recordLength; + return { + type: "MessageIndex", + channelId, + records, + }; +} - case Opcode.CHUNK: { - const startTime = reader.uint64(); - const endTime = reader.uint64(); - const uncompressedSize = reader.uint64(); - const uncompressedCrc = reader.uint32(); - const compression = reader.string(); - const recordByteLength = Number(reader.uint64()); - if (recordByteLength + reader.offset > recordView.byteLength) { - throw new Error("Chunk records length exceeds remaining record size"); - } - const records = new Uint8Array( - recordView.buffer, - recordView.byteOffset + reader.offset, - recordByteLength, - ).slice(); - const record: TypedMcapRecord = { - type: "Chunk", - messageStartTime: startTime, - messageEndTime: endTime, - compression, - uncompressedSize, - uncompressedCrc, - records, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } +function parseChunkIndex(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const messageStartTime = reader.uint64(); + const messageEndTime = reader.uint64(); + const chunkStartOffset = reader.uint64(); + const chunkLength = reader.uint64(); + const messageIndexOffsets = reader.map( + (r) => r.uint16(), + (r) => r.uint64(), + ); + const messageIndexLength = reader.uint64(); + const compression = reader.string(); + const compressedSize = 
reader.uint64(); + const uncompressedSize = reader.uint64(); + reader.offset = startOffset + recordLength; + return { + type: "ChunkIndex", + messageStartTime, + messageEndTime, + chunkStartOffset, + chunkLength, + messageIndexOffsets, + messageIndexLength, + compression, + compressedSize, + uncompressedSize, + }; +} - case Opcode.MESSAGE_INDEX: { - const channelId = reader.uint16(); - const records = reader.keyValuePairs( - (r) => r.uint64(), - (r) => r.uint64(), - ); - const record: TypedMcapRecord = { - type: "MessageIndex", - channelId, - records, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.CHUNK_INDEX: { - const messageStartTime = reader.uint64(); - const messageEndTime = reader.uint64(); - const chunkStartOffset = reader.uint64(); - const chunkLength = reader.uint64(); - const messageIndexOffsets = reader.map( - (r) => r.uint16(), - (r) => r.uint64(), - ); - const messageIndexLength = reader.uint64(); - const compression = reader.string(); - const compressedSize = reader.uint64(); - const uncompressedSize = reader.uint64(); - const record: TypedMcapRecord = { - type: "ChunkIndex", - messageStartTime, - messageEndTime, - chunkStartOffset, - chunkLength, - messageIndexOffsets, - messageIndexLength, - compression, - compressedSize, - uncompressedSize, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.ATTACHMENT: { - const logTime = reader.uint64(); - const createTime = reader.uint64(); - const name = reader.string(); - const mediaType = reader.string(); - const dataLen = reader.uint64(); - if (BigInt(recordView.byteOffset + reader.offset) + dataLen > Number.MAX_SAFE_INTEGER) { - throw new Error(`Attachment too large: ${dataLen}`); - } - if (reader.offset + Number(dataLen) + 4 /*crc*/ > recordView.byteLength) { - throw new Error(`Attachment data length ${dataLen} exceeds bounds of record`); - } - const data = new Uint8Array( - recordView.buffer, - recordView.byteOffset + reader.offset, - Number(dataLen), - ).slice(); - reader.offset += Number(dataLen); - const crcLength = reader.offset; - const expectedCrc = reader.uint32(); - if (validateCrcs && expectedCrc !== 0) { - const actualCrc = crc32(new DataView(recordView.buffer, recordView.byteOffset, crcLength)); - if (actualCrc !== expectedCrc) { - throw new Error( - `Attachment CRC32 mismatch: expected ${expectedCrc}, actual ${actualCrc}`, - ); - } - } - - const record: TypedMcapRecord = { - type: "Attachment", - logTime, - createTime, - name, - mediaType, - data, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.ATTACHMENT_INDEX: { - const offset = reader.uint64(); - const length = reader.uint64(); - const logTime = reader.uint64(); - const createTime = reader.uint64(); - const dataSize = reader.uint64(); - const name = reader.string(); - const mediaType = reader.string(); - - const record: TypedMcapRecord = { - type: "AttachmentIndex", - offset, - length, - logTime, - createTime, - dataSize, - name, - mediaType, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.STATISTICS: { - const messageCount = reader.uint64(); - const schemaCount = reader.uint16(); - const channelCount = reader.uint32(); - const attachmentCount = reader.uint32(); - const metadataCount = reader.uint32(); - const chunkCount = reader.uint32(); - const messageStartTime = reader.uint64(); - const messageEndTime = reader.uint64(); - const channelMessageCounts = reader.map( - (r) => r.uint16(), - (r) => r.uint64(), - ); - - 
const record: TypedMcapRecord = { - type: "Statistics", - messageCount, - schemaCount, - channelCount, - attachmentCount, - metadataCount, - chunkCount, - messageStartTime, - messageEndTime, - channelMessageCounts, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.METADATA: { - const name = reader.string(); - const metadata = reader.map( - (r) => r.string(), - (r) => r.string(), - ); - const record: TypedMcapRecord = { type: "Metadata", metadata, name }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.METADATA_INDEX: { - const offset = reader.uint64(); - const length = reader.uint64(); - const name = reader.string(); - - const record: TypedMcapRecord = { - type: "MetadataIndex", - offset, - length, - name, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.SUMMARY_OFFSET: { - const groupOpcode = reader.uint8(); - const groupStart = reader.uint64(); - const groupLength = reader.uint64(); - - const record: TypedMcapRecord = { - type: "SummaryOffset", - groupOpcode, - groupStart, - groupLength, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.DATA_END: { - const dataSectionCrc = reader.uint32(); - const record: TypedMcapRecord = { - type: "DataEnd", - dataSectionCrc, - }; - return { record, usedBytes: recordEndOffset - startOffset }; +function parseAttachment( + reader: Reader, + recordLength: number, + // NOTE: internal function in the hot path, (de)structuring args would be wasteful, acceptable perf/clarity tradeoff + // eslint-disable-next-line @foxglove/no-boolean-parameters + validateCrcs: boolean, +): TypedMcapRecord { + const startOffset = reader.offset; + const logTime = reader.uint64(); + const createTime = reader.uint64(); + const name = reader.string(); + const mediaType = reader.string(); + const dataLen = reader.uint64(); + // NOTE: probably not necessary, but just in case + if (BigInt(reader.offset) + dataLen > Number.MAX_SAFE_INTEGER) { + throw new Error(`Attachment too large: ${dataLen}`); + } + if (reader.offset + Number(dataLen) + 4 /*crc*/ > startOffset + recordLength) { + throw new Error(`Attachment data length ${dataLen} exceeds bounds of record`); + } + const data = reader.u8ArrayCopy(Number(dataLen)); + const crcLength = reader.offset - startOffset; + const expectedCrc = reader.uint32(); + if (validateCrcs && expectedCrc !== 0) { + reader.offset = startOffset; + const fullData = reader.u8ArrayBorrow(crcLength); + const actualCrc = crc32(fullData); + reader.offset = startOffset + crcLength + 4; + if (actualCrc !== expectedCrc) { + throw new Error(`Attachment CRC32 mismatch: expected ${expectedCrc}, actual ${actualCrc}`); } } + reader.offset = startOffset + recordLength; + + return { + type: "Attachment", + logTime, + createTime, + name, + mediaType, + data, + }; +} + +function parseAttachmentIndex(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const offset = reader.uint64(); + const length = reader.uint64(); + const logTime = reader.uint64(); + const createTime = reader.uint64(); + const dataSize = reader.uint64(); + const name = reader.string(); + const mediaType = reader.string(); + reader.offset = startOffset + recordLength; + + return { + type: "AttachmentIndex", + offset, + length, + logTime, + createTime, + dataSize, + name, + mediaType, + }; +} + +function parseStatistics(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const 
messageCount = reader.uint64(); + const schemaCount = reader.uint16(); + const channelCount = reader.uint32(); + const attachmentCount = reader.uint32(); + const metadataCount = reader.uint32(); + const chunkCount = reader.uint32(); + const messageStartTime = reader.uint64(); + const messageEndTime = reader.uint64(); + const channelMessageCounts = reader.map( + (r) => r.uint16(), + (r) => r.uint64(), + ); + reader.offset = startOffset + recordLength; + + return { + type: "Statistics", + messageCount, + schemaCount, + channelCount, + attachmentCount, + metadataCount, + chunkCount, + messageStartTime, + messageEndTime, + channelMessageCounts, + }; +} + +function parseMetadata(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const name = reader.string(); + const metadata = reader.map( + (r) => r.string(), + (r) => r.string(), + ); + reader.offset = startOffset + recordLength; + return { type: "Metadata", metadata, name }; +} + +function parseMetadataIndex(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const offset = reader.uint64(); + const length = reader.uint64(); + const name = reader.string(); + reader.offset = startOffset + recordLength; + + return { + type: "MetadataIndex", + offset, + length, + name, + }; +} + +function parseSummaryOffset(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const groupOpcode = reader.uint8(); + const groupStart = reader.uint64(); + const groupLength = reader.uint64(); + reader.offset = startOffset + recordLength; + + return { + type: "SummaryOffset", + groupOpcode, + groupStart, + groupLength, + }; +} + +function parseDataEnd(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const dataSectionCrc = reader.uint32(); + reader.offset = startOffset + recordLength; + return { + type: "DataEnd", + dataSectionCrc, + }; } From 55590de29474c8db5fcf3e23f0395bb8a0f117ab Mon Sep 17 00:00:00 2001 From: james-rms Date: Wed, 28 Aug 2024 12:27:35 +1000 Subject: [PATCH 30/44] python: add record size limit (#1225) ### Changelog - Added: the python MCAP library reader classes gained a new parameter, `record_size_limit`, defaulting to 4GiB. This allows callers to limit the size of records that their application will support. When a record is encountered with a greater length, the reader will raise an `InvalidRecordLength` exception. This limit can be removed by setting it to `None`. This helps applications avoid the issue where a corrupt MCAP can cause a `MemoryError`. ### Docs See generated python docs in the mcap.dev preview. ### Description
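A minimal usage sketch of the new parameter (the file path and the 16 MiB limit are illustrative, not part of this patch; the same keyword is accepted by `NonSeekingReader` and `StreamReader`):

```python
from mcap.exceptions import RecordLengthLimitExceeded
from mcap.reader import SeekingReader

# Cap individual record sizes at 16 MiB instead of the 4 GiB default;
# passing record_size_limit=None removes the cap entirely.
with open("example.mcap", "rb") as f:  # path is illustrative
    reader = SeekingReader(f, record_size_limit=16 * 2**20)
    try:
        header = reader.get_header()
        summary = reader.get_summary()
    except RecordLengthLimitExceeded as err:
        # Raised instead of attempting a huge allocation for a corrupt record.
        print(f"oversized record encountered: {err}")
```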
BeforeAfter
Fixes: #1220 --- python/mcap/mcap/__init__.py | 2 +- python/mcap/mcap/exceptions.py | 20 +++++++- python/mcap/mcap/reader.py | 55 +++++++++++++++++++--- python/mcap/mcap/stream_reader.py | 69 +++++++++++++++++----------- python/mcap/tests/test_reader.py | 76 ++++++++++++++++++++++++++++++- 5 files changed, 185 insertions(+), 37 deletions(-) diff --git a/python/mcap/mcap/__init__.py b/python/mcap/mcap/__init__.py index a82b376d2d..c68196d1cb 100644 --- a/python/mcap/mcap/__init__.py +++ b/python/mcap/mcap/__init__.py @@ -1 +1 @@ -__version__ = "1.1.1" +__version__ = "1.2.0" diff --git a/python/mcap/mcap/exceptions.py b/python/mcap/mcap/exceptions.py index e51bad2d22..024acfd7a5 100644 --- a/python/mcap/mcap/exceptions.py +++ b/python/mcap/mcap/exceptions.py @@ -1,9 +1,14 @@ +from typing import Any + +from mcap.opcode import Opcode + + class McapError(Exception): pass class InvalidMagic(McapError): - def __init__(self, bad_magic): + def __init__(self, bad_magic: Any): super().__init__(f"not a valid MCAP file, invalid magic: {bad_magic}") @@ -13,3 +18,16 @@ class DecoderNotFoundError(McapError): class EndOfFile(McapError): pass + + +class RecordLengthLimitExceeded(McapError): + def __init__(self, opcode: int, length: int, limit: int): + opcode_name = f"unknown (opcode {opcode})" + try: + opcode_name = Opcode(opcode).name + except ValueError: + # unknown opcode will trigger a ValueError + pass + super().__init__( + f"{opcode_name} record has length {length} that exceeds limit {limit}", + ) diff --git a/python/mcap/mcap/reader.py b/python/mcap/mcap/reader.py index f30c906201..96a47d8337 100644 --- a/python/mcap/mcap/reader.py +++ b/python/mcap/mcap/reader.py @@ -240,6 +240,11 @@ class SeekingReader(McapReader): :param decoder_factories: An iterable of :py:class:`~mcap.decoder.DecoderFactory` instances which can provide decoding functionality to :py:meth:`~mcap.reader.McapReader.iter_decoded_messages`. + :param record_size_limit: An upper bound to the size of MCAP records that this reader will + attempt to load in bytes, defaulting to 4 GiB. If this reader encounters a record with a + greater length, it will throw an :py:class:`~mcap.exceptions.RecordLengthLimitExceeded` + error. Setting to ``None`` removes the limit, but can allow corrupted MCAP files to trigger + a `MemoryError` exception. 
""" def __init__( @@ -247,12 +252,14 @@ def __init__( stream: IO[bytes], validate_crcs: bool = False, decoder_factories: Iterable[DecoderFactory] = (), + record_size_limit: Optional[int] = 4 * 2**30, ): super().__init__(decoder_factories=decoder_factories) read_magic(ReadDataStream(stream, calculate_crc=False)) self._stream = stream self._validate_crcs = validate_crcs self._summary: Optional[Summary] = None + self._record_size_limit = record_size_limit def iter_messages( self, @@ -323,7 +330,13 @@ def iter_messages( def get_header(self) -> Header: """Reads the Header record from the beginning of the MCAP file.""" self._stream.seek(0) - header = next(StreamReader(self._stream, skip_magic=False).records) + header = next( + StreamReader( + self._stream, + skip_magic=False, + record_size_limit=self._record_size_limit, + ).records + ) if not isinstance(header, Header): raise McapError( f"expected header at beginning of MCAP file, found {type(header)}" @@ -335,7 +348,13 @@ def get_summary(self) -> Optional[Summary]: if self._summary is not None: return self._summary self._stream.seek(-(FOOTER_SIZE + MAGIC_SIZE), io.SEEK_END) - footer = next(StreamReader(self._stream, skip_magic=True).records) + footer = next( + StreamReader( + self._stream, + skip_magic=True, + record_size_limit=self._record_size_limit, + ).records + ) if not isinstance(footer, Footer): raise McapError( f"expected footer at end of MCAP file, found {type(footer)}" @@ -344,7 +363,9 @@ def get_summary(self) -> Optional[Summary]: return None self._stream.seek(footer.summary_start, io.SEEK_SET) self._summary = _read_summary_from_stream_reader( - StreamReader(self._stream, skip_magic=True) + StreamReader( + self._stream, skip_magic=True, record_size_limit=self._record_size_limit + ) ) return self._summary @@ -358,7 +379,13 @@ def iter_attachments(self) -> Iterator[Attachment]: return for attachment_index in summary.attachment_indexes: self._stream.seek(attachment_index.offset) - record = next(StreamReader(self._stream, skip_magic=True).records) + record = next( + StreamReader( + self._stream, + skip_magic=True, + record_size_limit=self._record_size_limit, + ).records + ) if isinstance(record, Attachment): yield record else: @@ -374,7 +401,13 @@ def iter_metadata(self) -> Iterator[Metadata]: return for metadata_index in summary.metadata_indexes: self._stream.seek(metadata_index.offset) - record = next(StreamReader(self._stream, skip_magic=True).records) + record = next( + StreamReader( + self._stream, + skip_magic=True, + record_size_limit=self._record_size_limit, + ).records + ) if isinstance(record, Metadata): yield record else: @@ -389,6 +422,11 @@ class NonSeekingReader(McapReader): :param decoder_factories: An iterable of :py:class:`~mcap.decoder.DecoderFactory` instances which can provide decoding functionality to :py:meth:`~mcap.reader.McapReader.iter_decoded_messages`. + :param record_size_limit: An upper bound to the size of MCAP records that this reader will + attempt to load in bytes, defaulting to 4 GiB. If this reader encounters a record with a + greater length, it will throw an :py:class:`~mcap.exceptions.RecordLengthLimitExceeded` + error. Setting to ``None`` removes the limit, but can allow corrupted MCAP files to trigger + a `MemoryError` exception. 
""" def __init__( @@ -396,9 +434,14 @@ def __init__( stream: IO[bytes], validate_crcs: bool = False, decoder_factories: Iterable[DecoderFactory] = (), + record_size_limit: Optional[int] = 4 * 2**30, ): super().__init__(decoder_factories=decoder_factories) - self._stream_reader = StreamReader(stream, validate_crcs=validate_crcs) + self._stream_reader = StreamReader( + stream, + validate_crcs=validate_crcs, + record_size_limit=record_size_limit, + ) self._schemas: Dict[int, Schema] = {} self._channels: Dict[int, Channel] = {} self._spent: bool = False diff --git a/python/mcap/mcap/stream_reader.py b/python/mcap/mcap/stream_reader.py index eed23d9ef7..a519202434 100644 --- a/python/mcap/mcap/stream_reader.py +++ b/python/mcap/mcap/stream_reader.py @@ -7,7 +7,7 @@ import zstandard from .data_stream import ReadDataStream -from .exceptions import InvalidMagic +from .exceptions import InvalidMagic, RecordLengthLimitExceeded from .opcode import Opcode from .records import ( Attachment, @@ -98,8 +98,47 @@ def read_magic(stream: ReadDataStream) -> bool: class StreamReader: """ Reads MCAP data sequentially from an input stream. + + :param input: a file-like object for reading the source data from. + :param skip_magic: if ``True``, will not expect MCAP magic at start or end of stream. + :param emit_chunks: if ``True``, will return Chunk records directly and do not parse out the + records inside. + :param validate_crcs: if ``True``, will validate chunk and data section CRC values. + :param record_size_limit: An upper bound to the size of MCAP records that this reader will + attempt to load in bytes, defaulting to 4 GiB. If this reader encounters a record with a + greater length, it will throw an :py:class:`~mcap.exceptions.RecordLengthLimitExceeded` + error. Setting to ``None`` removes the limit, but can allow corrupted MCAP files to trigger + a `MemoryError` exception. """ + def __init__( + self, + input: Union[str, BytesIO, RawIOBase, BufferedReader, IO[bytes]], + skip_magic: bool = False, + emit_chunks: bool = False, + validate_crcs: bool = False, + record_size_limit: Optional[int] = (4 * 2**30), # 4 Gib + ): + """ + input: The input stream from which to read records. 
+ """ + if isinstance(input, str): + self._stream = ReadDataStream( + open(input, "rb"), calculate_crc=validate_crcs + ) + elif isinstance(input, RawIOBase): + self._stream = ReadDataStream( + BufferedReader(input), calculate_crc=validate_crcs + ) + else: + self._stream = ReadDataStream(input, calculate_crc=validate_crcs) + self._footer: Optional[Footer] = None + self._skip_magic: bool = skip_magic + self._emit_chunks: bool = emit_chunks + self._validate_crcs: bool = validate_crcs + self._calculated_data_section_crc = None + self._record_size_limit = record_size_limit + @property def records(self) -> Iterator[McapRecord]: """ @@ -116,6 +155,8 @@ def records(self) -> Iterator[McapRecord]: checksum_before_read = self._stream.checksum() opcode = self._stream.read1() length = self._stream.read8() + if self._record_size_limit is not None and length > self._record_size_limit: + raise RecordLengthLimitExceeded(opcode, length, self._record_size_limit) count = self._stream.count record = self._read_record(opcode, length) if ( @@ -143,32 +184,6 @@ def records(self) -> Iterator[McapRecord]: self._footer = record read_magic(self._stream) - def __init__( - self, - input: Union[str, BytesIO, RawIOBase, BufferedReader, IO[bytes]], - skip_magic: bool = False, - emit_chunks: bool = False, - validate_crcs: bool = False, - ): - """ - input: The input stream from which to read records. - """ - if isinstance(input, str): - self._stream = ReadDataStream( - open(input, "rb"), calculate_crc=validate_crcs - ) - elif isinstance(input, RawIOBase): - self._stream = ReadDataStream( - BufferedReader(input), calculate_crc=validate_crcs - ) - else: - self._stream = ReadDataStream(input, calculate_crc=validate_crcs) - self._footer: Optional[Footer] = None - self._skip_magic: bool = skip_magic - self._emit_chunks: bool = emit_chunks - self._validate_crcs: bool = validate_crcs - self._calculated_data_section_crc = None - def _read_record(self, opcode: int, length: int) -> Optional[McapRecord]: if opcode == Opcode.ATTACHMENT: return Attachment.read(self._stream) diff --git a/python/mcap/tests/test_reader.py b/python/mcap/tests/test_reader.py index 1b4e9654d5..d34423f335 100644 --- a/python/mcap/tests/test_reader.py +++ b/python/mcap/tests/test_reader.py @@ -1,15 +1,22 @@ """tests for the McapReader implementations.""" +# cspell:words getbuffer import json import os +from io import BytesIO from pathlib import Path from typing import IO, Any, Optional, Tuple, Type, Union import pytest from mcap.decoder import DecoderFactory -from mcap.exceptions import DecoderNotFoundError, InvalidMagic +from mcap.exceptions import ( + DecoderNotFoundError, + InvalidMagic, + RecordLengthLimitExceeded, +) from mcap.reader import McapReader, NonSeekingReader, SeekingReader, make_reader from mcap.records import Channel, Message, Schema +from mcap.stream_reader import StreamReader from mcap.writer import IndexType, Writer DEMO_MCAP = ( @@ -212,7 +219,7 @@ def write_no_summary_mcap(filepath: Path): writer.add_attachment(10, 10, "my_attach", "text", b"some data") writer.add_metadata("my_meta", {"foo": "bar"}) foo_channel = writer.register_channel("/foo", "json", 0) - for i in range(200): + for _ in range(200): writer.add_message(foo_channel, 10, json.dumps({"a": 0}).encode("utf8"), 10) writer.finish() @@ -252,3 +259,68 @@ def test_detect_invalid_initial_magic(tmpdir: Path): with open(filepath, "rb") as f: with pytest.raises(InvalidMagic): NonSeekingReader(f).get_header() + + +def test_record_size_limit(): + # create a simple small MCAP + 
write_stream = BytesIO() + writer = Writer(write_stream) + writer.start("profile", "library") + writer.finish() + + # default stream reader can read it + stream_reader = StreamReader( + BytesIO(write_stream.getbuffer()), record_size_limit=100 + ) + records = [r for r in stream_reader.records] + assert len(records) == 10 + + # can cause "large" records to raise an error by setting a low limit + stream_reader = StreamReader( + BytesIO(write_stream.getbuffer()), record_size_limit=10 + ) + with pytest.raises( + RecordLengthLimitExceeded, + match="HEADER record has length 22 that exceeds limit 10", + ): + next(stream_reader.records) + + # default seeking reader can read it + seeking_reader = SeekingReader( + BytesIO(write_stream.getbuffer()), record_size_limit=100 + ) + seeking_reader.get_header() + seeking_reader.get_summary() + assert len([m for m in seeking_reader.iter_messages()]) == 0 + + # can cause "large" records to raise an error by setting a low limit + seeking_reader = SeekingReader( + BytesIO(write_stream.getbuffer()), record_size_limit=10 + ) + with pytest.raises( + RecordLengthLimitExceeded, + match="HEADER record has length 22 that exceeds limit 10", + ): + seeking_reader.get_header() + + with pytest.raises( + RecordLengthLimitExceeded, + match="FOOTER record has length 20 that exceeds limit 10", + ): + seeking_reader.get_summary() + + # default non-seeking reader can read it + non_seeking_reader = NonSeekingReader( + BytesIO(write_stream.getbuffer()), record_size_limit=100 + ) + non_seeking_reader.get_header() + + # can cause "large" records to raise an error by setting a low limit + non_seeking_reader = NonSeekingReader( + BytesIO(write_stream.getbuffer()), record_size_limit=10 + ) + with pytest.raises( + RecordLengthLimitExceeded, + match="HEADER record has length 22 that exceeds limit 10", + ): + non_seeking_reader.get_header() From 462981dc778876d5f99bcdfc81aac692eb06b2bf Mon Sep 17 00:00:00 2001 From: Hans-Joachim Krauch Date: Fri, 30 Aug 2024 18:52:46 +0200 Subject: [PATCH 31/44] Bump @mcap/core version (#1226) ### Changelog Performance improvements (#1212) --- typescript/core/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/typescript/core/package.json b/typescript/core/package.json index 2153651369..9136926bd8 100644 --- a/typescript/core/package.json +++ b/typescript/core/package.json @@ -1,6 +1,6 @@ { "name": "@mcap/core", - "version": "2.1.2", + "version": "2.1.3", "description": "MCAP file support in TypeScript", "license": "MIT", "repository": { From ed894e7b15ff7e1a5c379e0450157734517b7062 Mon Sep 17 00:00:00 2001 From: Roman Shtylman Date: Fri, 30 Aug 2024 11:29:40 -0700 Subject: [PATCH 32/44] Revert "perf(ts): reuse Reader whilst parsing records" (#1227) Reverts foxglove/mcap#1212 This broke streaming in Foxglove app. Until we can investigate we need to put the library back to a working state. 
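For context, this revert restores the object-style `parseRecord` API that takes `{ view, startOffset, validateCrcs }` and returns `{ record, usedBytes }`. A minimal consumer sketch under that restored signature, mirroring the loop used in the reverted test code below (the `readAllRecords` helper is illustrative, not part of the library):

```typescript
import { parseMagic, parseRecord } from "./parse";
import { TypedMcapRecord } from "./types";

// Collect every record from a DataView that begins with MCAP magic, advancing
// the offset by the usedBytes reported for each parsed record.
function readAllRecords(view: DataView): TypedMcapRecord[] {
  const records: TypedMcapRecord[] = [];
  for (
    let offset = parseMagic(view, 0).usedBytes, result;
    (result = parseRecord({ view, startOffset: offset, validateCrcs: true })), result.record;
    offset += result.usedBytes
  ) {
    records.push(result.record);
  }
  return records;
}
```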
--- typescript/core/src/ChunkCursor.ts | 25 +- typescript/core/src/McapIndexedReader.ts | 159 ++--- typescript/core/src/McapStreamReader.ts | 100 ++-- typescript/core/src/McapWriter.test.ts | 12 +- typescript/core/src/Reader.ts | 38 +- typescript/core/src/StreamBuffer.test.ts | 47 ++ typescript/core/src/StreamBuffer.ts | 58 ++ typescript/core/src/parse.ts | 717 +++++++++++------------ 8 files changed, 600 insertions(+), 556 deletions(-) create mode 100644 typescript/core/src/StreamBuffer.test.ts create mode 100644 typescript/core/src/StreamBuffer.ts diff --git a/typescript/core/src/ChunkCursor.ts b/typescript/core/src/ChunkCursor.ts index 2113cc2b42..3a33303010 100644 --- a/typescript/core/src/ChunkCursor.ts +++ b/typescript/core/src/ChunkCursor.ts @@ -1,4 +1,3 @@ -import Reader from "./Reader"; import { parseRecord } from "./parse"; import { sortedIndexBy } from "./sortedIndexBy"; import { sortedLastIndexBy } from "./sortedLastIndex"; @@ -137,25 +136,31 @@ export class ChunkCursor { messageIndexes.byteLength, ); - const reader = new Reader(messageIndexesView); + let offset = 0; const arrayOfMessageOffsets: [logTime: bigint, offset: bigint][][] = []; - let record; - while ((record = parseRecord(reader, true))) { - if (record.type !== "MessageIndex") { + for ( + let result; + (result = parseRecord({ view: messageIndexesView, startOffset: offset, validateCrcs: true })), + result.record; + offset += result.usedBytes + ) { + if (result.record.type !== "MessageIndex") { continue; } if ( - record.records.length === 0 || - (this.#relevantChannels && !this.#relevantChannels.has(record.channelId)) + result.record.records.length === 0 || + (this.#relevantChannels && !this.#relevantChannels.has(result.record.channelId)) ) { continue; } - arrayOfMessageOffsets.push(record.records); + arrayOfMessageOffsets.push(result.record.records); } - if (reader.bytesRemaining() !== 0) { - throw new Error(`${reader.bytesRemaining()} bytes remaining in message index section`); + if (offset !== messageIndexesView.byteLength) { + throw new Error( + `${messageIndexesView.byteLength - offset} bytes remaining in message index section`, + ); } this.#orderedMessageOffsets = arrayOfMessageOffsets diff --git a/typescript/core/src/McapIndexedReader.ts b/typescript/core/src/McapIndexedReader.ts index 51ec00044e..5955300a40 100644 --- a/typescript/core/src/McapIndexedReader.ts +++ b/typescript/core/src/McapIndexedReader.ts @@ -2,7 +2,6 @@ import { crc32, crc32Final, crc32Init, crc32Update } from "@foxglove/crc"; import Heap from "heap-js"; import { ChunkCursor } from "./ChunkCursor"; -import Reader from "./Reader"; import { MCAP_MAGIC } from "./constants"; import { parseMagic, parseRecord } from "./parse"; import { DecompressHandlers, IReadable, TypedMcapRecords } from "./types"; @@ -112,7 +111,7 @@ export class McapIndexedReader { headerPrefix.byteOffset, headerPrefix.byteLength, ); - void parseMagic(new Reader(headerPrefixView)); + void parseMagic(headerPrefixView, 0); const headerContentLength = headerPrefixView.getBigUint64( MCAP_MAGIC.length + /* Opcode.HEADER */ 1, true, @@ -122,19 +121,26 @@ export class McapIndexedReader { const headerRecord = await readable.read(BigInt(MCAP_MAGIC.length), headerReadLength); headerEndOffset = BigInt(MCAP_MAGIC.length) + headerReadLength; - const headerReader = new Reader( - new DataView(headerRecord.buffer, headerRecord.byteOffset, headerRecord.byteLength), - ); - const headerResult = parseRecord(headerReader, true); - if (headerResult?.type !== "Header") { + const headerResult = 
parseRecord({ + view: new DataView(headerRecord.buffer, headerRecord.byteOffset, headerRecord.byteLength), + startOffset: 0, + validateCrcs: true, + }); + if (headerResult.record?.type !== "Header") { throw new Error( - `Unable to read header at beginning of file; found ${headerResult?.type ?? "nothing"}`, + `Unable to read header at beginning of file; found ${ + headerResult.record?.type ?? "nothing" + }`, ); } - if (headerReader.bytesRemaining() !== 0) { - throw new Error(`${headerReader.bytesRemaining()} bytes remaining after parsing header`); + if (headerResult.usedBytes !== headerRecord.byteLength) { + throw new Error( + `${ + headerRecord.byteLength - headerResult.usedBytes + } bytes remaining after parsing header`, + ); } - header = headerResult; + header = headerResult.record; } function errorWithLibrary(message: string): Error { @@ -173,32 +179,33 @@ export class McapIndexedReader { } try { - void parseMagic( - new Reader(footerAndMagicView, footerAndMagicView.byteLength - MCAP_MAGIC.length), - ); + void parseMagic(footerAndMagicView, footerAndMagicView.byteLength - MCAP_MAGIC.length); } catch (error) { throw errorWithLibrary((error as Error).message); } let footer: TypedMcapRecords["Footer"]; { - const footerReader = new Reader(footerAndMagicView); - const footerRecord = parseRecord(footerReader, true); - if (footerRecord?.type !== "Footer") { + const footerResult = parseRecord({ + view: footerAndMagicView, + startOffset: 0, + validateCrcs: true, + }); + if (footerResult.record?.type !== "Footer") { throw errorWithLibrary( `Unable to read footer from end of file (offset ${footerOffset}); found ${ - footerRecord?.type ?? "nothing" + footerResult.record?.type ?? "nothing" }`, ); } - if (footerReader.bytesRemaining() !== MCAP_MAGIC.length) { + if (footerResult.usedBytes !== footerAndMagicView.byteLength - MCAP_MAGIC.length) { throw errorWithLibrary( `${ - footerReader.bytesRemaining() - MCAP_MAGIC.length + footerAndMagicView.byteLength - MCAP_MAGIC.length - footerResult.usedBytes } bytes remaining after parsing footer`, ); } - footer = footerRecord; + footer = footerResult.record; } if (footer.summaryStart === 0n) { throw errorWithLibrary("File is not indexed"); @@ -254,7 +261,6 @@ export class McapIndexedReader { dataEndAndSummarySection.byteOffset, dataEndAndSummarySection.byteLength, ); - const indexReader = new Reader(indexView); const channelsById = new Map(); const schemasById = new Map(); @@ -265,42 +271,46 @@ export class McapIndexedReader { let statistics: TypedMcapRecords["Statistics"] | undefined; let dataSectionCrc: number | undefined; - let first = true; - let result; - while ((result = parseRecord(indexReader, true))) { - if (first && result.type !== "DataEnd") { + let offset = 0; + for ( + let result; + (result = parseRecord({ view: indexView, startOffset: offset, validateCrcs: true })), + result.record; + offset += result.usedBytes + ) { + if (offset === 0 && result.record.type !== "DataEnd") { throw errorWithLibrary( - `Expected DataEnd record to precede summary section, but found ${result.type}`, + `Expected DataEnd record to precede summary section, but found ${result.record.type}`, ); } - first = false; - switch (result.type) { + switch (result.record.type) { case "Schema": - schemasById.set(result.id, result); + schemasById.set(result.record.id, result.record); break; case "Channel": - channelsById.set(result.id, result); + channelsById.set(result.record.id, result.record); break; case "ChunkIndex": - chunkIndexes.push(result); + 
chunkIndexes.push(result.record); break; case "AttachmentIndex": - attachmentIndexes.push(result); + attachmentIndexes.push(result.record); break; case "MetadataIndex": - metadataIndexes.push(result); + metadataIndexes.push(result.record); break; case "Statistics": if (statistics) { throw errorWithLibrary("Duplicate Statistics record"); } - statistics = result; + statistics = result.record; break; case "SummaryOffset": - summaryOffsetsByOpcode.set(result.groupOpcode, result); + summaryOffsetsByOpcode.set(result.record.groupOpcode, result.record); break; case "DataEnd": - dataSectionCrc = result.dataSectionCrc === 0 ? undefined : result.dataSectionCrc; + dataSectionCrc = + result.record.dataSectionCrc === 0 ? undefined : result.record.dataSectionCrc; break; case "Header": case "Footer": @@ -309,13 +319,13 @@ export class McapIndexedReader { case "MessageIndex": case "Attachment": case "Metadata": - throw errorWithLibrary(`${result.type} record not allowed in index section`); + throw errorWithLibrary(`${result.record.type} record not allowed in index section`); case "Unknown": break; } } - if (indexReader.bytesRemaining() !== 0) { - throw errorWithLibrary(`${indexReader.bytesRemaining()} bytes remaining in index section`); + if (offset !== indexView.byteLength) { + throw errorWithLibrary(`${indexView.byteLength - offset} bytes remaining in index section`); } return new McapIndexedReader({ @@ -385,7 +395,6 @@ export class McapIndexedReader { // cursor becomes active (i.e. when we first need to access messages from the chunk) and removed // when the cursor is removed from the heap. const chunkViewCache = new Map(); - const chunkReader = new Reader(new DataView(new ArrayBuffer(0))); for (let cursor; (cursor = chunkCursors.peek()); ) { if (!cursor.hasMessageIndexes()) { // If we encounter a chunk whose message indexes have not been loaded yet, load them and re-organize the heap. @@ -412,24 +421,27 @@ export class McapIndexedReader { `Message offset beyond chunk bounds (log time ${logTime}, offset ${offset}, chunk data length ${chunkView.byteLength}) in chunk at offset ${cursor.chunkIndex.chunkStartOffset}`, ); } - chunkReader.reset(chunkView, Number(offset)); - const record = parseRecord(chunkReader, validateCrcs ?? true); - if (!record) { + const result = parseRecord({ + view: chunkView, + startOffset: Number(offset), + validateCrcs: validateCrcs ?? 
true, + }); + if (!result.record) { throw this.#errorWithLibrary( `Unable to parse record at offset ${offset} in chunk at offset ${cursor.chunkIndex.chunkStartOffset}`, ); } - if (record.type !== "Message") { + if (result.record.type !== "Message") { throw this.#errorWithLibrary( - `Unexpected record type ${record.type} in message index (time ${logTime}, offset ${offset} in chunk at offset ${cursor.chunkIndex.chunkStartOffset})`, + `Unexpected record type ${result.record.type} in message index (time ${logTime}, offset ${offset} in chunk at offset ${cursor.chunkIndex.chunkStartOffset})`, ); } - if (record.logTime !== logTime) { + if (result.record.logTime !== logTime) { throw this.#errorWithLibrary( - `Message log time ${record.logTime} did not match message index entry (${logTime} at offset ${offset} in chunk at offset ${cursor.chunkIndex.chunkStartOffset})`, + `Message log time ${result.record.logTime} did not match message index entry (${logTime} at offset ${offset} in chunk at offset ${cursor.chunkIndex.chunkStartOffset})`, ); } - yield record; + yield result.record; if (cursor.hasMoreMessages()) { // There is no need to reorganize the heap when chunks are ordered and not overlapping. @@ -456,18 +468,19 @@ export class McapIndexedReader { continue; } const metadataData = await this.#readable.read(metadataIndex.offset, metadataIndex.length); - const metadataReader = new Reader( - new DataView(metadataData.buffer, metadataData.byteOffset, metadataData.byteLength), - ); - const metadataRecord = parseRecord(metadataReader, false); - if (metadataRecord?.type !== "Metadata") { + const metadataResult = parseRecord({ + view: new DataView(metadataData.buffer, metadataData.byteOffset, metadataData.byteLength), + startOffset: 0, + validateCrcs: false, + }); + if (metadataResult.record?.type !== "Metadata") { throw this.#errorWithLibrary( `Metadata data at offset ${ metadataIndex.offset - } does not point to metadata record (found ${String(metadataRecord?.type)})`, + } does not point to metadata record (found ${String(metadataResult.record?.type)})`, ); } - yield metadataRecord; + yield metadataResult.record; } } @@ -506,18 +519,23 @@ export class McapIndexedReader { attachmentIndex.offset, attachmentIndex.length, ); - const attachmentReader = new Reader( - new DataView(attachmentData.buffer, attachmentData.byteOffset, attachmentData.byteLength), - ); - const attachmentRecord = parseRecord(attachmentReader, validateCrcs ?? true); - if (attachmentRecord?.type !== "Attachment") { + const attachmentResult = parseRecord({ + view: new DataView( + attachmentData.buffer, + attachmentData.byteOffset, + attachmentData.byteLength, + ), + startOffset: 0, + validateCrcs: validateCrcs ?? true, + }); + if (attachmentResult.record?.type !== "Attachment") { throw this.#errorWithLibrary( `Attachment data at offset ${ attachmentIndex.offset - } does not point to attachment record (found ${String(attachmentRecord?.type)})`, + } does not point to attachment record (found ${String(attachmentResult.record?.type)})`, ); } - yield attachmentRecord; + yield attachmentResult.record; } } @@ -529,19 +547,20 @@ export class McapIndexedReader { chunkIndex.chunkStartOffset, chunkIndex.chunkLength, ); - const chunkReader = new Reader( - new DataView(chunkData.buffer, chunkData.byteOffset, chunkData.byteLength), - ); - const chunkRecord = parseRecord(chunkReader, options?.validateCrcs ?? 
true); - if (chunkRecord?.type !== "Chunk") { + const chunkResult = parseRecord({ + view: new DataView(chunkData.buffer, chunkData.byteOffset, chunkData.byteLength), + startOffset: 0, + validateCrcs: options?.validateCrcs ?? true, + }); + if (chunkResult.record?.type !== "Chunk") { throw this.#errorWithLibrary( `Chunk start offset ${ chunkIndex.chunkStartOffset - } does not point to chunk record (found ${String(chunkRecord?.type)})`, + } does not point to chunk record (found ${String(chunkResult.record?.type)})`, ); } - const chunk = chunkRecord; + const chunk = chunkResult.record; let buffer = chunk.records; if (chunk.compression !== "" && buffer.byteLength > 0) { const decompress = this.#decompressHandlers?.[chunk.compression]; diff --git a/typescript/core/src/McapStreamReader.ts b/typescript/core/src/McapStreamReader.ts index 6ed8292ce9..ddd412ef38 100644 --- a/typescript/core/src/McapStreamReader.ts +++ b/typescript/core/src/McapStreamReader.ts @@ -1,6 +1,6 @@ import { crc32 } from "@foxglove/crc"; -import Reader from "./Reader"; +import StreamBuffer from "./StreamBuffer"; import { MCAP_MAGIC } from "./constants"; import { parseMagic, parseRecord } from "./parse"; import { Channel, DecompressHandlers, McapMagic, TypedMcapRecord, TypedMcapRecords } from "./types"; @@ -50,9 +50,7 @@ type McapReaderOptions = { * ``` */ export default class McapStreamReader { - #buffer = new ArrayBuffer(MCAP_MAGIC.length * 2); - #view = new DataView(this.#buffer); - #reader = new Reader(this.#view, MCAP_MAGIC.length * 2); // Cursor starts at end of initial buffer + #buffer = new StreamBuffer(MCAP_MAGIC.length * 2); #decompressHandlers; #includeChunks; #validateCrcs; @@ -80,7 +78,7 @@ export default class McapStreamReader { /** @returns The number of bytes that have been received by `append()` but not yet parsed. */ bytesRemaining(): number { - return this.#reader.bytesRemaining(); + return this.#buffer.bytesRemaining(); } /** @@ -91,41 +89,7 @@ export default class McapStreamReader { if (this.#doneReading) { throw new Error("Already done reading"); } - this.#appendOrShift(data); - this.#reader.reset(this.#view); - } - #appendOrShift(data: Uint8Array): void { - /** Add data to the buffer, shifting existing data or reallocating if necessary. */ - const remainingBytes = this.#reader.bytesRemaining(); - const totalNeededBytes = remainingBytes + data.byteLength; - - if (totalNeededBytes <= this.#buffer.byteLength) { - // Data fits in the current buffer - if (this.#view.byteOffset + totalNeededBytes <= this.#buffer.byteLength) { - // Data fits by appending only - const array = new Uint8Array(this.#buffer, this.#view.byteOffset); - array.set(data, remainingBytes); - this.#view = new DataView(this.#buffer, this.#view.byteOffset, totalNeededBytes); - } else { - // Data fits but requires moving existing data to start of buffer - const existingData = new Uint8Array(this.#buffer, this.#view.byteOffset, remainingBytes); - const array = new Uint8Array(this.#buffer); - array.set(existingData, 0); - array.set(data, existingData.byteLength); - this.#view = new DataView(this.#buffer, 0, totalNeededBytes); - } - } else { - // New data doesn't fit, copy to a new buffer - - // Currently, the new buffer size may be smaller than the old size. For future optimizations, - // we could consider making the buffer size increase monotonically. 
- this.#buffer = new ArrayBuffer(totalNeededBytes * 2); - const array = new Uint8Array(this.#buffer); - const existingData = new Uint8Array(this.#view.buffer, this.#view.byteOffset, remainingBytes); - array.set(existingData, 0); - array.set(data, existingData.byteLength); - this.#view = new DataView(this.#buffer, 0, totalNeededBytes); - } + this.#buffer.append(data); } /** @@ -165,10 +129,11 @@ export default class McapStreamReader { *#read(): Generator { if (!this.#noMagicPrefix) { - let magic: McapMagic | undefined; - while (((magic = parseMagic(this.#reader)), !magic)) { + let magic: McapMagic | undefined, usedBytes: number | undefined; + while ((({ magic, usedBytes } = parseMagic(this.#buffer.view, 0)), !magic)) { yield; } + this.#buffer.consume(usedBytes); } let header: TypedMcapRecords["Header"] | undefined; @@ -179,10 +144,20 @@ export default class McapStreamReader { for (;;) { let record; - while (((record = parseRecord(this.#reader, this.#validateCrcs)), !record)) { - yield; + { + let usedBytes; + while ( + (({ record, usedBytes } = parseRecord({ + view: this.#buffer.view, + startOffset: 0, + validateCrcs: this.#validateCrcs, + })), + !record) + ) { + yield; + } + this.#buffer.consume(usedBytes); } - switch (record.type) { case "Unknown": break; @@ -231,10 +206,18 @@ export default class McapStreamReader { } } const view = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength); - const chunkReader = new Reader(view); - let chunkRecord; - while ((chunkRecord = parseRecord(chunkReader, this.#validateCrcs))) { - switch (chunkRecord.type) { + let chunkOffset = 0; + for ( + let chunkResult; + (chunkResult = parseRecord({ + view, + startOffset: chunkOffset, + validateCrcs: this.#validateCrcs, + })), + chunkResult.record; + chunkOffset += chunkResult.usedBytes + ) { + switch (chunkResult.record.type) { case "Unknown": break; case "Header": @@ -249,31 +232,34 @@ export default class McapStreamReader { case "MetadataIndex": case "SummaryOffset": case "DataEnd": - throw errorWithLibrary(`${chunkRecord.type} record not allowed inside a chunk`); + throw errorWithLibrary( + `${chunkResult.record.type} record not allowed inside a chunk`, + ); case "Schema": case "Channel": case "Message": - yield chunkRecord; + yield chunkResult.record; break; } } - if (chunkReader.bytesRemaining() !== 0) { - throw errorWithLibrary(`${chunkReader.bytesRemaining()} bytes remaining in chunk`); + if (chunkOffset !== buffer.byteLength) { + throw errorWithLibrary(`${buffer.byteLength - chunkOffset} bytes remaining in chunk`); } break; } case "Footer": try { - let magic; - while (((magic = parseMagic(this.#reader)), !magic)) { + let magic, usedBytes; + while ((({ magic, usedBytes } = parseMagic(this.#buffer.view, 0)), !magic)) { yield; } + this.#buffer.consume(usedBytes); } catch (error) { throw errorWithLibrary((error as Error).message); } - if (this.#reader.bytesRemaining() !== 0) { + if (this.#buffer.bytesRemaining() !== 0) { throw errorWithLibrary( - `${this.#reader.bytesRemaining()} bytes remaining after MCAP footer and trailing magic`, + `${this.#buffer.bytesRemaining()} bytes remaining after MCAP footer and trailing magic`, ); } return record; diff --git a/typescript/core/src/McapWriter.test.ts b/typescript/core/src/McapWriter.test.ts index e16bc515d5..57bdcd82fc 100644 --- a/typescript/core/src/McapWriter.test.ts +++ b/typescript/core/src/McapWriter.test.ts @@ -3,7 +3,6 @@ import { crc32 } from "@foxglove/crc"; import { McapIndexedReader } from "./McapIndexedReader"; import McapStreamReader from 
"./McapStreamReader"; import { McapWriter } from "./McapWriter"; -import Reader from "./Reader"; import { TempBuffer } from "./TempBuffer"; import { MCAP_MAGIC, Opcode } from "./constants"; import { parseMagic, parseRecord } from "./parse"; @@ -279,12 +278,13 @@ describe("McapWriter", () => { const array = tempBuffer.get(); const view = new DataView(array.buffer, array.byteOffset, array.byteLength); - const reader = new Reader(view); const records: TypedMcapRecord[] = []; - parseMagic(reader); - let result; - while ((result = parseRecord(reader, true))) { - records.push(result); + for ( + let offset = parseMagic(view, 0).usedBytes, result; + (result = parseRecord({ view, startOffset: offset, validateCrcs: true })), result.record; + offset += result.usedBytes + ) { + records.push(result.record); } const expectedChunkData = new Uint8Array([ diff --git a/typescript/core/src/Reader.ts b/typescript/core/src/Reader.ts index d0136c648b..fcc2887237 100644 --- a/typescript/core/src/Reader.ts +++ b/typescript/core/src/Reader.ts @@ -7,27 +7,13 @@ const textDecoder = new TextDecoder(); export default class Reader { #view: DataView; - #viewU8: Uint8Array; offset: number; constructor(view: DataView, offset = 0) { this.#view = view; - this.#viewU8 = new Uint8Array(view.buffer, view.byteOffset, view.byteLength); this.offset = offset; } - // Should be ~identical to the constructor, it allows us to reinitialize the reader when - // the view changes, without creating a new instance, avoiding allocation / GC overhead - reset(view: DataView, offset = 0): void { - this.#view = view; - this.#viewU8 = new Uint8Array(view.buffer, view.byteOffset, view.byteLength); - this.offset = offset; - } - - bytesRemaining(): number { - return this.#viewU8.length - this.offset; - } - uint8(): number { const value = this.#view.getUint8(this.offset); this.offset += 1; @@ -54,12 +40,14 @@ export default class Reader { string(): string { const length = this.uint32(); - if (length === 0) { - return ""; - } else if (length > this.bytesRemaining()) { + if (this.offset + length > this.#view.byteLength) { throw new Error(`String length ${length} exceeds bounds of buffer`); } - return textDecoder.decode(this.u8ArrayBorrow(length)); + const value = textDecoder.decode( + new Uint8Array(this.#view.buffer, this.#view.byteOffset + this.offset, length), + ); + this.offset += length; + return value; } keyValuePairs(readKey: (reader: Reader) => K, readValue: (reader: Reader) => V): [K, V][] { @@ -115,18 +103,4 @@ export default class Reader { } return result; } - - // Read a borrowed Uint8Array, useful temp references or borrow semantics - u8ArrayBorrow(length: number): Uint8Array { - const result = this.#viewU8.subarray(this.offset, this.offset + length); - this.offset += length; - return result; - } - - // Read a copied Uint8Array from the underlying buffer, use when you need to keep the data around - u8ArrayCopy(length: number): Uint8Array { - const result = this.#viewU8.slice(this.offset, this.offset + length); - this.offset += length; - return result; - } } diff --git a/typescript/core/src/StreamBuffer.test.ts b/typescript/core/src/StreamBuffer.test.ts new file mode 100644 index 0000000000..a45175b3e3 --- /dev/null +++ b/typescript/core/src/StreamBuffer.test.ts @@ -0,0 +1,47 @@ +import StreamBuffer from "./StreamBuffer"; + +function toArray(view: DataView) { + return new Uint8Array(view.buffer, view.byteOffset, view.byteLength); +} + +describe("ByteStorage", () => { + it("handles basic append and consume", () => { + const buffer = new 
StreamBuffer(); + expect(buffer.bytesRemaining()).toBe(0); + + buffer.append(new Uint8Array([1, 2, 3])); + expect(buffer.bytesRemaining()).toBe(3); + expect(() => { + buffer.consume(4); + }).toThrow(); + + expect(toArray(buffer.view)).toEqual(new Uint8Array([1, 2, 3])); + buffer.consume(3); + expect(buffer.bytesRemaining()).toBe(0); + }); + + it("handles partial consume", () => { + const buffer = new StreamBuffer(); + + buffer.append(new Uint8Array([1, 2, 3, 4, 5])); + expect(buffer.bytesRemaining()).toBe(5); + buffer.consume(2); + expect(buffer.bytesRemaining()).toBe(3); + + expect(toArray(buffer.view)).toEqual(new Uint8Array([3, 4, 5])); + buffer.consume(3); + expect(buffer.bytesRemaining()).toBe(0); + }); + + it("reuses buffer within allocated capacity", () => { + const buffer = new StreamBuffer(5); + const rawBuffer = buffer.view.buffer; + buffer.append(new Uint8Array([1, 2])); + expect(buffer.view.buffer).toBe(rawBuffer); + buffer.append(new Uint8Array([3, 4, 5])); + expect(buffer.view.buffer).toBe(rawBuffer); + buffer.append(new Uint8Array([6, 7])); + expect(buffer.view.buffer).not.toBe(rawBuffer); + expect(toArray(buffer.view)).toEqual(new Uint8Array([1, 2, 3, 4, 5, 6, 7])); + }); +}); diff --git a/typescript/core/src/StreamBuffer.ts b/typescript/core/src/StreamBuffer.ts new file mode 100644 index 0000000000..98eaa785d5 --- /dev/null +++ b/typescript/core/src/StreamBuffer.ts @@ -0,0 +1,58 @@ +/** + * A growable buffer for use when processing a stream of data. + */ +export default class StreamBuffer { + #buffer: ArrayBuffer; + public view: DataView; + + constructor(initialCapacity = 0) { + this.#buffer = new ArrayBuffer(initialCapacity); + this.view = new DataView(this.#buffer, 0, 0); + } + + bytesRemaining(): number { + return this.view.byteLength; + } + + /** Mark some data as consumed, so the memory can be reused when new data is appended. */ + consume(count: number): void { + this.view = new DataView( + this.#buffer, + this.view.byteOffset + count, + this.view.byteLength - count, + ); + } + + /** Add data to the buffer, shifting existing data or reallocating if necessary. */ + append(data: Uint8Array): void { + if (this.view.byteOffset + this.view.byteLength + data.byteLength <= this.#buffer.byteLength) { + // Data fits by appending only + const array = new Uint8Array(this.view.buffer, this.view.byteOffset); + array.set(data, this.view.byteLength); + this.view = new DataView( + this.#buffer, + this.view.byteOffset, + this.view.byteLength + data.byteLength, + ); + } else if (this.view.byteLength + data.byteLength <= this.#buffer.byteLength) { + // Data fits in allocated buffer but requires moving existing data to start of buffer + const oldData = new Uint8Array(this.#buffer, this.view.byteOffset, this.view.byteLength); + const array = new Uint8Array(this.#buffer); + array.set(oldData, 0); + array.set(data, oldData.byteLength); + this.view = new DataView(this.#buffer, 0, this.view.byteLength + data.byteLength); + } else { + // New data doesn't fit, copy to a new buffer + + // Currently, the new buffer size may be smaller than the old size. For future optimizations, + // we could consider making the buffer size increase monotonically. 
+ + const oldData = new Uint8Array(this.#buffer, this.view.byteOffset, this.view.byteLength); + this.#buffer = new ArrayBuffer((this.view.byteLength + data.byteLength) * 2); + const array = new Uint8Array(this.#buffer); + array.set(oldData, 0); + array.set(data, oldData.byteLength); + this.view = new DataView(this.#buffer, 0, this.view.byteLength + data.byteLength); + } + } +} diff --git a/typescript/core/src/parse.ts b/typescript/core/src/parse.ts index 95d0105750..7f2fe80285 100644 --- a/typescript/core/src/parse.ts +++ b/typescript/core/src/parse.ts @@ -1,419 +1,374 @@ import { crc32 } from "@foxglove/crc"; import Reader from "./Reader"; -import { MCAP_MAGIC, Opcode } from "./constants"; +import { isKnownOpcode, MCAP_MAGIC, Opcode } from "./constants"; import { McapMagic, TypedMcapRecord } from "./types"; /** * Parse a MCAP magic string at `startOffset` in `view`. */ -export function parseMagic(reader: Reader): McapMagic | undefined { - if (reader.bytesRemaining() < MCAP_MAGIC.length) { - return undefined; +export function parseMagic( + view: DataView, + startOffset: number, +): { magic: McapMagic; usedBytes: number } | { magic?: undefined; usedBytes: 0 } { + if (startOffset + MCAP_MAGIC.length > view.byteLength) { + return { usedBytes: 0 }; } - const magic = reader.u8ArrayBorrow(MCAP_MAGIC.length); - if (!MCAP_MAGIC.every((val, i) => val === magic[i])) { + if (!MCAP_MAGIC.every((val, i) => val === view.getUint8(startOffset + i))) { throw new Error( `Expected MCAP magic '${MCAP_MAGIC.map((val) => val.toString(16).padStart(2, "0")).join( " ", - )}', found '${Array.from(magic, (_, i) => magic[i]!.toString(16).padStart(2, "0")).join( - " ", - )}'`, + )}', found '${Array.from(MCAP_MAGIC, (_, i) => + view + .getUint8(startOffset + i) + .toString(16) + .padStart(2, "0"), + ).join(" ")}'`, ); } - return { specVersion: "0" }; + return { + magic: { specVersion: "0" }, + usedBytes: MCAP_MAGIC.length, + }; } /** - * Parse a MCAP record from the given reader + * Parse a MCAP record beginning at `startOffset` in `view`. 
*/ -// NOTE: internal function in the hot path, (de)structuring args would be wasteful, acceptable perf/clarity tradeoff -// eslint-disable-next-line @foxglove/no-boolean-parameters -export function parseRecord(reader: Reader, validateCrcs = false): TypedMcapRecord | undefined { - const RECORD_HEADER_SIZE = 1 /*opcode*/ + 8; /*record content length*/ - if (reader.bytesRemaining() < RECORD_HEADER_SIZE) { - return undefined; +export function parseRecord({ + view, + startOffset, + validateCrcs, +}: { + view: DataView; + startOffset: number; + validateCrcs: boolean; +}): { record: TypedMcapRecord; usedBytes: number } | { record?: undefined; usedBytes: 0 } { + if (startOffset + /*opcode*/ 1 + /*record content length*/ 8 >= view.byteLength) { + return { usedBytes: 0 }; } - const start = reader.offset; - const opcode = reader.uint8(); - const recordLength = reader.uint64(); + const headerReader = new Reader(view, startOffset); + const opcode = headerReader.uint8(); + + const recordLength = headerReader.uint64(); if (recordLength > Number.MAX_SAFE_INTEGER) { throw new Error(`Record content length ${recordLength} is too large`); } - const recordLengthNum = Number(recordLength); - - if (reader.bytesRemaining() < recordLengthNum) { - reader.offset = start; // Rewind to the start of the record - return undefined; + const recordEndOffset = headerReader.offset + recordLengthNum; + if (recordEndOffset > view.byteLength) { + return { usedBytes: 0 }; } - let result: TypedMcapRecord; - switch (opcode as Opcode) { - case Opcode.HEADER: - result = parseHeader(reader, recordLengthNum); - break; - case Opcode.FOOTER: - result = parseFooter(reader, recordLengthNum); - break; - case Opcode.SCHEMA: - result = parseSchema(reader, recordLengthNum); - break; - case Opcode.CHANNEL: - result = parseChannel(reader, recordLengthNum); - break; - case Opcode.MESSAGE: - result = parseMessage(reader, recordLengthNum); - break; - case Opcode.CHUNK: - result = parseChunk(reader, recordLengthNum); - break; - case Opcode.MESSAGE_INDEX: - result = parseMessageIndex(reader, recordLengthNum); - break; - case Opcode.CHUNK_INDEX: - result = parseChunkIndex(reader, recordLengthNum); - break; - case Opcode.ATTACHMENT: - result = parseAttachment(reader, recordLengthNum, validateCrcs); - break; - case Opcode.ATTACHMENT_INDEX: - result = parseAttachmentIndex(reader, recordLengthNum); - break; - case Opcode.STATISTICS: - result = parseStatistics(reader, recordLengthNum); - break; - case Opcode.METADATA: - result = parseMetadata(reader, recordLengthNum); - break; - case Opcode.METADATA_INDEX: - result = parseMetadataIndex(reader, recordLengthNum); - break; - case Opcode.SUMMARY_OFFSET: - result = parseSummaryOffset(reader, recordLengthNum); - break; - case Opcode.DATA_END: - result = parseDataEnd(reader, recordLengthNum); - break; - default: - result = parseUnknown(reader, recordLengthNum, opcode); - break; - } - - // NOTE: a bit redundant, but ensures we've advanced by the full record length - // TODO: simplify this when we explore monomorphic paths - reader.offset = start + RECORD_HEADER_SIZE + recordLengthNum; - - return result; -} - -function parseUnknown(reader: Reader, recordLength: number, opcode: number): TypedMcapRecord { - const data = reader.u8ArrayBorrow(recordLength); - return { - type: "Unknown", - opcode, - data, - }; -} - -function parseHeader(reader: Reader, recordLength: number): TypedMcapRecord { - const startOffset = reader.offset; - const profile = reader.string(); - const library = reader.string(); - reader.offset = 
startOffset + recordLength; - return { type: "Header", profile, library }; -} - -function parseFooter(reader: Reader, recordLength: number): TypedMcapRecord { - const startOffset = reader.offset; - const summaryStart = reader.uint64(); - const summaryOffsetStart = reader.uint64(); - const summaryCrc = reader.uint32(); - reader.offset = startOffset + recordLength; - return { - type: "Footer", - summaryStart, - summaryOffsetStart, - summaryCrc, - }; -} - -function parseSchema(reader: Reader, recordLength: number): TypedMcapRecord { - const start = reader.offset; - const id = reader.uint16(); - const name = reader.string(); - const encoding = reader.string(); - const dataLen = reader.uint32(); - const end = reader.offset; - if (recordLength - (end - start) < dataLen) { - throw new Error(`Schema data length ${dataLen} exceeds bounds of record`); + if (!isKnownOpcode(opcode)) { + const data = new Uint8Array( + view.buffer, + view.byteOffset + headerReader.offset, + recordLengthNum, + ); + const record: TypedMcapRecord = { + type: "Unknown", + opcode, + data, + }; + return { record, usedBytes: recordEndOffset - startOffset }; } - const data = reader.u8ArrayCopy(dataLen); - reader.offset = start + recordLength; - - return { - type: "Schema", - id, - encoding, - name, - data, - }; -} -function parseChannel(reader: Reader, recordLength: number): TypedMcapRecord { - const startOffset = reader.offset; - const channelId = reader.uint16(); - const schemaId = reader.uint16(); - const topicName = reader.string(); - const messageEncoding = reader.string(); - const metadata = reader.map( - (r) => r.string(), - (r) => r.string(), + const recordView = new DataView( + view.buffer, + view.byteOffset + headerReader.offset, + recordLengthNum, ); - reader.offset = startOffset + recordLength; + const reader = new Reader(recordView); + + switch (opcode) { + case Opcode.HEADER: { + const profile = reader.string(); + const library = reader.string(); + const record: TypedMcapRecord = { type: "Header", profile, library }; + return { record, usedBytes: recordEndOffset - startOffset }; + } - return { - type: "Channel", - id: channelId, - schemaId, - topic: topicName, - messageEncoding, - metadata, - }; -} + case Opcode.FOOTER: { + const summaryStart = reader.uint64(); + const summaryOffsetStart = reader.uint64(); + const summaryCrc = reader.uint32(); + const record: TypedMcapRecord = { + type: "Footer", + summaryStart, + summaryOffsetStart, + summaryCrc, + }; + return { record, usedBytes: recordEndOffset - startOffset }; + } -function parseMessage(reader: Reader, recordLength: number): TypedMcapRecord { - const MESSAGE_PREFIX_SIZE = 2 + 4 + 8 + 8; // channelId, sequence, logTime, publishTime - const channelId = reader.uint16(); - const sequence = reader.uint32(); - const logTime = reader.uint64(); - const publishTime = reader.uint64(); - const data = reader.u8ArrayCopy(recordLength - MESSAGE_PREFIX_SIZE); - return { - type: "Message", - channelId, - sequence, - logTime, - publishTime, - data, - }; -} + case Opcode.SCHEMA: { + const id = reader.uint16(); + const name = reader.string(); + const encoding = reader.string(); + const dataLen = reader.uint32(); + if (reader.offset + dataLen > recordView.byteLength) { + throw new Error(`Schema data length ${dataLen} exceeds bounds of record`); + } + const data = new Uint8Array( + recordView.buffer, + recordView.byteOffset + reader.offset, + dataLen, + ).slice(); + reader.offset += dataLen; + + const record: TypedMcapRecord = { + type: "Schema", + id, + encoding, + name, + data, 
+ }; + + return { record, usedBytes: recordEndOffset - startOffset }; + } -function parseChunk(reader: Reader, recordLength: number): TypedMcapRecord { - const start = reader.offset; - const startTime = reader.uint64(); - const endTime = reader.uint64(); - const uncompressedSize = reader.uint64(); - const uncompressedCrc = reader.uint32(); - const compression = reader.string(); - const recordsByteLength = Number(reader.uint64()); - const end = reader.offset; - const prefixSize = end - start; - if (recordsByteLength + prefixSize > recordLength) { - throw new Error("Chunk records length exceeds remaining record size"); - } - const records = reader.u8ArrayCopy(recordsByteLength); - reader.offset = start + recordLength; - return { - type: "Chunk", - messageStartTime: startTime, - messageEndTime: endTime, - compression, - uncompressedSize, - uncompressedCrc, - records, - }; -} + case Opcode.CHANNEL: { + const channelId = reader.uint16(); + const schemaId = reader.uint16(); + const topicName = reader.string(); + const messageEncoding = reader.string(); + const metadata = reader.map( + (r) => r.string(), + (r) => r.string(), + ); + + const record: TypedMcapRecord = { + type: "Channel", + id: channelId, + schemaId, + topic: topicName, + messageEncoding, + metadata, + }; + + return { record, usedBytes: recordEndOffset - startOffset }; + } -function parseMessageIndex(reader: Reader, recordLength: number): TypedMcapRecord { - const startOffset = reader.offset; - const channelId = reader.uint16(); - const records = reader.keyValuePairs( - (r) => r.uint64(), - (r) => r.uint64(), - ); - reader.offset = startOffset + recordLength; - return { - type: "MessageIndex", - channelId, - records, - }; -} + case Opcode.MESSAGE: { + const channelId = reader.uint16(); + const sequence = reader.uint32(); + const logTime = reader.uint64(); + const publishTime = reader.uint64(); + const data = new Uint8Array( + recordView.buffer, + recordView.byteOffset + reader.offset, + recordView.byteLength - reader.offset, + ).slice(); + const record: TypedMcapRecord = { + type: "Message", + channelId, + sequence, + logTime, + publishTime, + data, + }; + return { record, usedBytes: recordEndOffset - startOffset }; + } -function parseChunkIndex(reader: Reader, recordLength: number): TypedMcapRecord { - const startOffset = reader.offset; - const messageStartTime = reader.uint64(); - const messageEndTime = reader.uint64(); - const chunkStartOffset = reader.uint64(); - const chunkLength = reader.uint64(); - const messageIndexOffsets = reader.map( - (r) => r.uint16(), - (r) => r.uint64(), - ); - const messageIndexLength = reader.uint64(); - const compression = reader.string(); - const compressedSize = reader.uint64(); - const uncompressedSize = reader.uint64(); - reader.offset = startOffset + recordLength; - return { - type: "ChunkIndex", - messageStartTime, - messageEndTime, - chunkStartOffset, - chunkLength, - messageIndexOffsets, - messageIndexLength, - compression, - compressedSize, - uncompressedSize, - }; -} + case Opcode.CHUNK: { + const startTime = reader.uint64(); + const endTime = reader.uint64(); + const uncompressedSize = reader.uint64(); + const uncompressedCrc = reader.uint32(); + const compression = reader.string(); + const recordByteLength = Number(reader.uint64()); + if (recordByteLength + reader.offset > recordView.byteLength) { + throw new Error("Chunk records length exceeds remaining record size"); + } + const records = new Uint8Array( + recordView.buffer, + recordView.byteOffset + reader.offset, + recordByteLength, 
+ ).slice(); + const record: TypedMcapRecord = { + type: "Chunk", + messageStartTime: startTime, + messageEndTime: endTime, + compression, + uncompressedSize, + uncompressedCrc, + records, + }; + return { record, usedBytes: recordEndOffset - startOffset }; + } -function parseAttachment( - reader: Reader, - recordLength: number, - // NOTE: internal function in the hot path, (de)structuring args would be wasteful, acceptable perf/clarity tradeoff - // eslint-disable-next-line @foxglove/no-boolean-parameters - validateCrcs: boolean, -): TypedMcapRecord { - const startOffset = reader.offset; - const logTime = reader.uint64(); - const createTime = reader.uint64(); - const name = reader.string(); - const mediaType = reader.string(); - const dataLen = reader.uint64(); - // NOTE: probably not necessary, but just in case - if (BigInt(reader.offset) + dataLen > Number.MAX_SAFE_INTEGER) { - throw new Error(`Attachment too large: ${dataLen}`); - } - if (reader.offset + Number(dataLen) + 4 /*crc*/ > startOffset + recordLength) { - throw new Error(`Attachment data length ${dataLen} exceeds bounds of record`); - } - const data = reader.u8ArrayCopy(Number(dataLen)); - const crcLength = reader.offset - startOffset; - const expectedCrc = reader.uint32(); - if (validateCrcs && expectedCrc !== 0) { - reader.offset = startOffset; - const fullData = reader.u8ArrayBorrow(crcLength); - const actualCrc = crc32(fullData); - reader.offset = startOffset + crcLength + 4; - if (actualCrc !== expectedCrc) { - throw new Error(`Attachment CRC32 mismatch: expected ${expectedCrc}, actual ${actualCrc}`); + case Opcode.MESSAGE_INDEX: { + const channelId = reader.uint16(); + const records = reader.keyValuePairs( + (r) => r.uint64(), + (r) => r.uint64(), + ); + const record: TypedMcapRecord = { + type: "MessageIndex", + channelId, + records, + }; + return { record, usedBytes: recordEndOffset - startOffset }; + } + case Opcode.CHUNK_INDEX: { + const messageStartTime = reader.uint64(); + const messageEndTime = reader.uint64(); + const chunkStartOffset = reader.uint64(); + const chunkLength = reader.uint64(); + const messageIndexOffsets = reader.map( + (r) => r.uint16(), + (r) => r.uint64(), + ); + const messageIndexLength = reader.uint64(); + const compression = reader.string(); + const compressedSize = reader.uint64(); + const uncompressedSize = reader.uint64(); + const record: TypedMcapRecord = { + type: "ChunkIndex", + messageStartTime, + messageEndTime, + chunkStartOffset, + chunkLength, + messageIndexOffsets, + messageIndexLength, + compression, + compressedSize, + uncompressedSize, + }; + return { record, usedBytes: recordEndOffset - startOffset }; + } + case Opcode.ATTACHMENT: { + const logTime = reader.uint64(); + const createTime = reader.uint64(); + const name = reader.string(); + const mediaType = reader.string(); + const dataLen = reader.uint64(); + if (BigInt(recordView.byteOffset + reader.offset) + dataLen > Number.MAX_SAFE_INTEGER) { + throw new Error(`Attachment too large: ${dataLen}`); + } + if (reader.offset + Number(dataLen) + 4 /*crc*/ > recordView.byteLength) { + throw new Error(`Attachment data length ${dataLen} exceeds bounds of record`); + } + const data = new Uint8Array( + recordView.buffer, + recordView.byteOffset + reader.offset, + Number(dataLen), + ).slice(); + reader.offset += Number(dataLen); + const crcLength = reader.offset; + const expectedCrc = reader.uint32(); + if (validateCrcs && expectedCrc !== 0) { + const actualCrc = crc32(new DataView(recordView.buffer, recordView.byteOffset, crcLength)); 
+ if (actualCrc !== expectedCrc) { + throw new Error( + `Attachment CRC32 mismatch: expected ${expectedCrc}, actual ${actualCrc}`, + ); + } + } + + const record: TypedMcapRecord = { + type: "Attachment", + logTime, + createTime, + name, + mediaType, + data, + }; + return { record, usedBytes: recordEndOffset - startOffset }; + } + case Opcode.ATTACHMENT_INDEX: { + const offset = reader.uint64(); + const length = reader.uint64(); + const logTime = reader.uint64(); + const createTime = reader.uint64(); + const dataSize = reader.uint64(); + const name = reader.string(); + const mediaType = reader.string(); + + const record: TypedMcapRecord = { + type: "AttachmentIndex", + offset, + length, + logTime, + createTime, + dataSize, + name, + mediaType, + }; + return { record, usedBytes: recordEndOffset - startOffset }; + } + case Opcode.STATISTICS: { + const messageCount = reader.uint64(); + const schemaCount = reader.uint16(); + const channelCount = reader.uint32(); + const attachmentCount = reader.uint32(); + const metadataCount = reader.uint32(); + const chunkCount = reader.uint32(); + const messageStartTime = reader.uint64(); + const messageEndTime = reader.uint64(); + const channelMessageCounts = reader.map( + (r) => r.uint16(), + (r) => r.uint64(), + ); + + const record: TypedMcapRecord = { + type: "Statistics", + messageCount, + schemaCount, + channelCount, + attachmentCount, + metadataCount, + chunkCount, + messageStartTime, + messageEndTime, + channelMessageCounts, + }; + return { record, usedBytes: recordEndOffset - startOffset }; + } + case Opcode.METADATA: { + const name = reader.string(); + const metadata = reader.map( + (r) => r.string(), + (r) => r.string(), + ); + const record: TypedMcapRecord = { type: "Metadata", metadata, name }; + return { record, usedBytes: recordEndOffset - startOffset }; + } + case Opcode.METADATA_INDEX: { + const offset = reader.uint64(); + const length = reader.uint64(); + const name = reader.string(); + + const record: TypedMcapRecord = { + type: "MetadataIndex", + offset, + length, + name, + }; + return { record, usedBytes: recordEndOffset - startOffset }; + } + case Opcode.SUMMARY_OFFSET: { + const groupOpcode = reader.uint8(); + const groupStart = reader.uint64(); + const groupLength = reader.uint64(); + + const record: TypedMcapRecord = { + type: "SummaryOffset", + groupOpcode, + groupStart, + groupLength, + }; + return { record, usedBytes: recordEndOffset - startOffset }; + } + case Opcode.DATA_END: { + const dataSectionCrc = reader.uint32(); + const record: TypedMcapRecord = { + type: "DataEnd", + dataSectionCrc, + }; + return { record, usedBytes: recordEndOffset - startOffset }; } } - reader.offset = startOffset + recordLength; - - return { - type: "Attachment", - logTime, - createTime, - name, - mediaType, - data, - }; -} - -function parseAttachmentIndex(reader: Reader, recordLength: number): TypedMcapRecord { - const startOffset = reader.offset; - const offset = reader.uint64(); - const length = reader.uint64(); - const logTime = reader.uint64(); - const createTime = reader.uint64(); - const dataSize = reader.uint64(); - const name = reader.string(); - const mediaType = reader.string(); - reader.offset = startOffset + recordLength; - - return { - type: "AttachmentIndex", - offset, - length, - logTime, - createTime, - dataSize, - name, - mediaType, - }; -} - -function parseStatistics(reader: Reader, recordLength: number): TypedMcapRecord { - const startOffset = reader.offset; - const messageCount = reader.uint64(); - const schemaCount = 
reader.uint16(); - const channelCount = reader.uint32(); - const attachmentCount = reader.uint32(); - const metadataCount = reader.uint32(); - const chunkCount = reader.uint32(); - const messageStartTime = reader.uint64(); - const messageEndTime = reader.uint64(); - const channelMessageCounts = reader.map( - (r) => r.uint16(), - (r) => r.uint64(), - ); - reader.offset = startOffset + recordLength; - - return { - type: "Statistics", - messageCount, - schemaCount, - channelCount, - attachmentCount, - metadataCount, - chunkCount, - messageStartTime, - messageEndTime, - channelMessageCounts, - }; -} - -function parseMetadata(reader: Reader, recordLength: number): TypedMcapRecord { - const startOffset = reader.offset; - const name = reader.string(); - const metadata = reader.map( - (r) => r.string(), - (r) => r.string(), - ); - reader.offset = startOffset + recordLength; - return { type: "Metadata", metadata, name }; -} - -function parseMetadataIndex(reader: Reader, recordLength: number): TypedMcapRecord { - const startOffset = reader.offset; - const offset = reader.uint64(); - const length = reader.uint64(); - const name = reader.string(); - reader.offset = startOffset + recordLength; - - return { - type: "MetadataIndex", - offset, - length, - name, - }; -} - -function parseSummaryOffset(reader: Reader, recordLength: number): TypedMcapRecord { - const startOffset = reader.offset; - const groupOpcode = reader.uint8(); - const groupStart = reader.uint64(); - const groupLength = reader.uint64(); - reader.offset = startOffset + recordLength; - - return { - type: "SummaryOffset", - groupOpcode, - groupStart, - groupLength, - }; -} - -function parseDataEnd(reader: Reader, recordLength: number): TypedMcapRecord { - const startOffset = reader.offset; - const dataSectionCrc = reader.uint32(); - reader.offset = startOffset + recordLength; - return { - type: "DataEnd", - dataSectionCrc, - }; } From 1a5cc293325f44d64de2f5a0914c68ca4ce91f85 Mon Sep 17 00:00:00 2001 From: Roman Shtylman Date: Fri, 30 Aug 2024 11:34:21 -0700 Subject: [PATCH 33/44] Bump @mcap/core version (#1228) ### Changelog Revert of performance improvements which broke stream reading. ### Docs None ### Description We needed to revert performance improvements https://github.com/foxglove/mcap/pull/1227 which broke stream reading. --- typescript/core/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/typescript/core/package.json b/typescript/core/package.json index 9136926bd8..e02ee9523a 100644 --- a/typescript/core/package.json +++ b/typescript/core/package.json @@ -1,6 +1,6 @@ { "name": "@mcap/core", - "version": "2.1.3", + "version": "2.1.4", "description": "MCAP file support in TypeScript", "license": "MIT", "repository": { From a7aac1359554199265001585e0bbc11eab900725 Mon Sep 17 00:00:00 2001 From: Bahram Banisadr Date: Tue, 3 Sep 2024 18:28:36 -0700 Subject: [PATCH 34/44] Updating references & links from slack to discord (#1232) ### Changelog Updating references from community slack to community discord. ### Docs None ### Description * Links updated to point to https://foxglove.dev/chat * "Slack channel" etc. references updated to "Discord community" etc. 
--- .github/ISSUE_TEMPLATE/config.yml | 4 ++-- python/mcap-protobuf-support/README.md | 2 +- python/mcap-ros1-support/README.md | 2 +- python/mcap-ros2-support/README.md | 2 +- typescript/browser/README.md | 2 +- typescript/core/README.md | 2 +- typescript/nodejs/README.md | 2 +- typescript/support/README.md | 2 +- website/docusaurus.config.js | 8 ++++---- 9 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index ad371798fd..c08369020c 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -10,5 +10,5 @@ contact_links: url: https://robotics.stackexchange.com/questions/ask about: Get help from the robotics community - name: "💬 Live chat" - url: https://foxglove.dev/slack - about: Join the discussion in our Slack community + url: https://foxglove.dev/chat + about: Join the discussion in our Discord community diff --git a/python/mcap-protobuf-support/README.md b/python/mcap-protobuf-support/README.md index 9d89c0fa8b..9fd4a1a7c6 100644 --- a/python/mcap-protobuf-support/README.md +++ b/python/mcap-protobuf-support/README.md @@ -35,4 +35,4 @@ pipenv run python point_cloud_example.py output.mcap ## Stay in touch -Join our [Slack channel](https://foxglove.dev/slack) to ask questions, share feedback, and stay up to date on what our team is working on. +Join our [Discord community](https://foxglove.dev/chat) to ask questions, share feedback, and stay up to date on what our team is working on. diff --git a/python/mcap-ros1-support/README.md b/python/mcap-ros1-support/README.md index 48088f23e5..a7202c5b01 100644 --- a/python/mcap-ros1-support/README.md +++ b/python/mcap-ros1-support/README.md @@ -36,5 +36,5 @@ ros_writer.finish() ## Stay in touch -Join our [Slack channel](https://foxglove.dev/slack) to ask questions, +Join our [Discord community](https://foxglove.dev/chat) to ask questions, share feedback, and stay up to date on what our team is working on. diff --git a/python/mcap-ros2-support/README.md b/python/mcap-ros2-support/README.md index 28d11a2d25..2184acd90a 100644 --- a/python/mcap-ros2-support/README.md +++ b/python/mcap-ros2-support/README.md @@ -22,5 +22,5 @@ for msg in read_ros2_messages("my_data.mcap"): ## Stay in touch -Join our [Slack channel](https://foxglove.dev/slack) to ask questions, +Join our [Discord community](https://foxglove.dev/chat) to ask questions, share feedback, and stay up to date on what our team is working on. diff --git a/typescript/browser/README.md b/typescript/browser/README.md index d3050d47f4..0a0c1b5a2c 100644 --- a/typescript/browser/README.md +++ b/typescript/browser/README.md @@ -30,4 +30,4 @@ async function onInputOrDrop(event: InputEvent | DragEvent) { ## Stay in touch -Join our [Slack channel](https://foxglove.dev/slack) to ask questions, share feedback, and stay up to date on what our team is working on. +Join our [Discord community](https://foxglove.dev/chat) to ask questions, share feedback, and stay up to date on what our team is working on. diff --git a/typescript/core/README.md b/typescript/core/README.md index 2e007000cd..38a49310f7 100644 --- a/typescript/core/README.md +++ b/typescript/core/README.md @@ -14,4 +14,4 @@ Examples of how to use the `@mcap/core` APIs can be found in the [TypeScript exa ## Stay in touch -Join our [Slack channel](https://foxglove.dev/slack) to ask questions, share feedback, and stay up to date on what our team is working on. 
+Join our [Discord community](https://foxglove.dev/chat) to ask questions, share feedback, and stay up to date on what our team is working on. diff --git a/typescript/nodejs/README.md b/typescript/nodejs/README.md index cf4b87998d..fe3e135290 100644 --- a/typescript/nodejs/README.md +++ b/typescript/nodejs/README.md @@ -47,4 +47,4 @@ const writer = new McapWriter({ ## Stay in touch -Join our [Slack channel](https://foxglove.dev/slack) to ask questions, share feedback, and stay up to date on what our team is working on. +Join our [Discord community](https://foxglove.dev/chat) to ask questions, share feedback, and stay up to date on what our team is working on. diff --git a/typescript/support/README.md b/typescript/support/README.md index 73491b3c73..4bd7261041 100644 --- a/typescript/support/README.md +++ b/typescript/support/README.md @@ -46,4 +46,4 @@ const reader = await McapIndexedReader.Initialize({ ## Stay in touch -Join our [Slack channel](https://foxglove.dev/slack) to ask questions, share feedback, and stay up to date on what our team is working on. +Join our [Discord community](https://foxglove.dev/chat) to ask questions, share feedback, and stay up to date on what our team is working on. diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index ec210baf19..a2c78a0d09 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -115,8 +115,8 @@ const config = { label: "Specification", }, { - href: "https://foxglove.dev/slack", - label: "Slack", + href: "https://foxglove.dev/chat", + label: "Discord", position: "right", }, { @@ -154,8 +154,8 @@ const config = { href: "https://github.com/foxglove/mcap", }, { - label: "Slack", - href: "https://foxglove.dev/slack", + label: "Discord", + href: "https://foxglove.dev/chat", }, { label: "Stack Overflow", From 6a2fe3553d0a5aa66f2634fa70a8c606780f8e0c Mon Sep 17 00:00:00 2001 From: Kirill Morozov Date: Thu, 12 Sep 2024 20:53:54 +0300 Subject: [PATCH 35/44] docs: specify units for `log_time` and `publish_time` (#1235) --- python/mcap/mcap/writer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/mcap/mcap/writer.py b/python/mcap/mcap/writer.py index e1ff466a46..0383454e2c 100644 --- a/python/mcap/mcap/writer.py +++ b/python/mcap/mcap/writer.py @@ -169,9 +169,10 @@ def add_message( :param channel_id: The id of the channel to which the message should be added. :param sequence: Optional message counter assigned by publisher. - :param log_time: Time at which the message was recorded. - :param publish_time: Time at which the message was published. If not available, must be set - to the log time. + :param log_time: Time at which the message was recorded as nanoseconds since a + user-understood epoch (i.e unix epoch, robot boot time, etc.). + :param publish_time: Time at which the message was published as nanoseconds since a + user-understood epoch (i.e unix epoch, robot boot time, etc.). :param data: Message data, to be decoded according to the schema of the channel. """ message = Message( From c67c6328eff14193c21762466912ef890255a54a Mon Sep 17 00:00:00 2001 From: james-rms Date: Sun, 15 Sep 2024 21:35:22 +1000 Subject: [PATCH 36/44] rust: add async reading functionality (#1211) ### Changelog - rust: switches the LZ4 compression dependency from `lz4_flex` to `lz4-rs`. This moves us from using a pure-rust lz4 implementation to C bindings. I believe this is worthwhile because `lz4_flex` does not support LZ4 "high compression mode". 
The practical reason for doing so in this PR is that `lz4_flex` does not expose interfaces that make it easy to build an AsyncRead adapter for it, but `lz4-rs` does.
- rust: Adds structs to read MCAP data asynchronously in a linear stream.

### Docs

- Check generated rust docs for review.

### Description

Adds an async `RecordReader` implementation for reading MCAP data asynchronously. This is an optional feature, named `tokio`. I chose this feature flag name and this module name because this functionality is tied heavily into the Tokio ecosystem. If at some point we rebuild this to be async-executor-agnostic, we can add that functionality under a new module and feature flag name.
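Because the new API differs from the existing synchronous `MessageStream` (it hands back raw record bytes plus an opcode rather than parsed messages), here is a minimal usage sketch adapted from the doc example added in `rust/src/tokio/read.rs`. The input path and the tokio feature setup are assumptions for illustration, not part of this patch:

```rust
use tokio::fs::File;

// Sketch only: assumes a binary crate depending on `mcap` with its new `tokio`
// feature enabled, plus tokio's `fs`, `rt`, and `macros` features.
#[tokio::main(flavor = "current_thread")]
async fn main() {
    // "in.mcap" is a placeholder input path.
    let file = File::open("in.mcap").await.expect("couldn't open file");
    let mut reader = mcap::tokio::RecordReader::new(file);

    // The caller owns the buffer, so one allocation is reused for every record.
    let mut record_buf: Vec<u8> = Vec::new();
    while let Some(result) = reader.next_record(&mut record_buf).await {
        let opcode = result.expect("couldn't read next record");
        // `next_record` fills `record_buf` with the raw record body and returns the
        // opcode; `parse_record` then borrows that buffer to build a typed Record.
        let _record = mcap::parse_record(opcode, &record_buf[..]).expect("couldn't parse record");
        println!("read record with opcode 0x{opcode:02X}");
    }
}
```

This split is also why `parse_record` is now re-exported from the crate root: the async reader yields raw record bytes and leaves parsing to the caller.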
BeforeAfter
--- .github/workflows/ci.yml | 13 +- rust/Cargo.toml | 18 +- rust/examples/common/serialization.rs | 133 ++++++ rust/examples/conformance_reader.rs | 137 +----- rust/examples/conformance_reader_async.rs | 34 ++ rust/src/lib.rs | 6 +- rust/src/read.rs | 9 +- rust/src/records.rs | 38 ++ rust/src/tokio.rs | 8 + rust/src/tokio/lz4.rs | 146 ++++++ rust/src/tokio/read.rs | 434 ++++++++++++++++++ rust/src/tokio/read_exact_or_zero.rs | 103 +++++ rust/src/write.rs | 15 +- rust/tests/attachment.rs | 3 +- rust/tests/metadata.rs | 3 +- .../runners/RustAsyncReaderTestRunner.ts | 22 + .../scripts/run-tests/runners/index.ts | 2 + 17 files changed, 971 insertions(+), 153 deletions(-) create mode 100644 rust/examples/common/serialization.rs create mode 100644 rust/examples/conformance_reader_async.rs create mode 100644 rust/src/tokio.rs create mode 100644 rust/src/tokio/lz4.rs create mode 100644 rust/src/tokio/read.rs create mode 100644 rust/src/tokio/read_exact_or_zero.rs create mode 100644 tests/conformance/scripts/run-tests/runners/RustAsyncReaderTestRunner.ts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e7e2083df1..e4c9ae282c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -160,7 +160,8 @@ jobs: with: toolchain: stable default: true - - run: cd rust && cargo build --example=conformance_reader + - run: cargo build --example=conformance_reader --example=conformance_reader_async --features=tokio + working-directory: rust - run: yarn install --immutable - run: yarn test:conformance:generate-inputs --verify - run: yarn test:conformance --runner rust- @@ -490,13 +491,19 @@ jobs: toolchain: stable default: true components: "rustfmt, clippy" + - run: rustup target add wasm32-unknown-unknown - run: cargo fmt --all -- --check - run: cargo clippy -- --no-deps - run: cargo clippy --no-default-features -- --no-deps - run: cargo clippy --no-default-features --features lz4 -- --no-deps - run: cargo clippy --no-default-features --features zstd -- --no-deps - - run: cargo build - - run: cargo test + - run: cargo clippy --no-default-features --features tokio -- --no-deps + - run: cargo clippy --no-default-features --features tokio,lz4 -- --no-deps + - run: cargo clippy --no-default-features --features tokio,zstd -- --no-deps + - run: cargo build --all-features + - run: cargo test --all-features + - run: cargo build --all-features --target wasm32-unknown-unknown + - run: cargo check --all-features --target wasm32-unknown-unknown - name: "publish to crates.io" if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/releases/rust/v') run: cargo publish --token ${{ secrets.RUST_CRATES_IO_TOKEN }} diff --git a/rust/Cargo.toml b/rust/Cargo.toml index f2fa8ca0d2..0cd54d80ed 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -7,7 +7,7 @@ categories = [ "science::robotics", "compression" ] repository = "https://github.com/foxglove/mcap" documentation = "https://docs.rs/mcap" readme = "README.md" -version = "0.9.2" +version = "0.10.0" edition = "2021" license = "MIT" @@ -22,7 +22,9 @@ log = "0.4" num_cpus = "1.13" paste = "1.0" thiserror = "1.0" -lz4_flex = { version = "0.11.1", optional = true } +lz4 = { version = "1.27", optional = true } +async-compression = { version = "*", features = ["tokio"], optional = true } +tokio = { version = "1", features = ["io-util"] , optional = true } [target.'cfg(target_arch = "wasm32")'.dependencies] zstd = { version = "0.11", features = ["wasm"], optional = true } @@ -32,15 +34,16 @@ zstd = { version = "0.11", features = 
["zstdmt"], optional = true } [features] default = ["zstd", "lz4"] -zstd = ["dep:zstd"] -lz4 = ["dep:lz4_flex"] +zstd = ["dep:zstd", "async-compression/zstd"] +lz4 = ["dep:lz4"] +tokio = ["dep:async-compression", "dep:tokio"] [dev-dependencies] anyhow = "1" atty = "0.2" camino = "1.0" clap = { version = "3.2", features = ["derive"]} -criterion = "0.5.1" +criterion = { version = "0.5.1", features = ["async_tokio"] } itertools = "0.10" memmap = "0.7" rayon = "1.5" @@ -48,6 +51,7 @@ serde = { version = "1.0.145", features = ["derive"] } serde_json = "1" simplelog = "0.12" tempfile = "3.3" +tokio = { version = "1", features = ["io-util", "macros", "rt", "fs"] } [[bench]] name = "reader" @@ -57,3 +61,7 @@ harness = false opt-level = 3 debug = true lto = true + +[[example]] +name = "conformance_reader_async" +required-features = ["tokio"] diff --git a/rust/examples/common/serialization.rs b/rust/examples/common/serialization.rs new file mode 100644 index 0000000000..73c1c7f9e4 --- /dev/null +++ b/rust/examples/common/serialization.rs @@ -0,0 +1,133 @@ +use mcap::records::Record; + +use std::collections::BTreeMap; + +use serde_json::{json, Value}; + +// We don't want to force Serde on users just for the sake of the conformance tests. +// (In what context would you want to serialize individual records of a MCAP?) +// Stamp out and stringify them ourselves: + +fn get_type(rec: &Record<'_>) -> &'static str { + match rec { + Record::Header(_) => "Header", + Record::Footer(_) => "Footer", + Record::Schema { .. } => "Schema", + Record::Channel(_) => "Channel", + Record::Message { .. } => "Message", + Record::Chunk { .. } => "Chunk", + Record::MessageIndex(_) => "MessageIndex", + Record::ChunkIndex(_) => "ChunkIndex", + Record::Attachment { .. } => "Attachment", + Record::AttachmentIndex(_) => "AttachmentIndex", + Record::Statistics(_) => "Statistics", + Record::Metadata(_) => "Metadata", + Record::MetadataIndex(_) => "MetadataIndex", + Record::SummaryOffset(_) => "SummaryOffset", + Record::DataEnd(_) => "DataEnd", + Record::Unknown { opcode, .. } => { + panic!("Unknown record in conformance test: (op {opcode})") + } + } +} + +fn get_fields(rec: &Record<'_>) -> Value { + fn b2s(bytes: &[u8]) -> Vec { + bytes.iter().map(|b| b.to_string()).collect() + } + fn m2s(map: &BTreeMap) -> BTreeMap { + map.iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect() + } + + match rec { + Record::Header(h) => json!([["library", h.library], ["profile", h.profile]]), + Record::Footer(f) => json!([ + ["summary_crc", f.summary_crc.to_string()], + ["summary_offset_start", f.summary_offset_start.to_string()], + ["summary_start", f.summary_start.to_string()] + ]), + Record::Schema { header, data } => json!([ + ["data", b2s(data)], + ["encoding", header.encoding], + ["id", header.id.to_string()], + ["name", header.name] + ]), + Record::Channel(c) => json!([ + ["id", c.id.to_string()], + ["message_encoding", c.message_encoding], + ["metadata", c.metadata], + ["schema_id", c.schema_id.to_string()], + ["topic", c.topic] + ]), + Record::Message { header, data } => json!([ + ["channel_id", header.channel_id.to_string()], + ["data", b2s(data)], + ["log_time", header.log_time.to_string()], + ["publish_time", header.publish_time.to_string()], + ["sequence", header.sequence.to_string()] + ]), + Record::Chunk { .. 
} => unreachable!("Chunks are flattened"), + Record::MessageIndex(_) => unreachable!("MessageIndexes are skipped"), + Record::ChunkIndex(i) => json!([ + ["chunk_length", i.chunk_length.to_string()], + ["chunk_start_offset", i.chunk_start_offset.to_string()], + ["compressed_size", i.compressed_size.to_string()], + ["compression", i.compression], + ["message_end_time", i.message_end_time.to_string()], + ["message_index_length", i.message_index_length.to_string()], + ["message_index_offsets", m2s(&i.message_index_offsets)], + ["message_start_time", i.message_start_time.to_string()], + ["uncompressed_size", i.uncompressed_size.to_string()] + ]), + Record::Attachment { header, data } => json!([ + ["create_time", header.create_time.to_string()], + ["data", b2s(data)], + ["log_time", header.log_time.to_string()], + ["media_type", header.media_type], + ["name", header.name] + ]), + Record::AttachmentIndex(i) => json!([ + ["create_time", i.create_time.to_string()], + ["data_size", i.data_size.to_string()], + ["length", i.length.to_string()], + ["log_time", i.log_time.to_string()], + ["media_type", i.media_type], + ["name", i.name], + ["offset", i.offset.to_string()] + ]), + Record::Statistics(s) => json!([ + ["attachment_count", s.attachment_count.to_string()], + ["channel_count", s.channel_count.to_string()], + ["channel_message_counts", m2s(&s.channel_message_counts)], + ["chunk_count", s.chunk_count.to_string()], + ["message_count", s.message_count.to_string()], + ["message_end_time", s.message_end_time.to_string()], + ["message_start_time", s.message_start_time.to_string()], + ["metadata_count", s.metadata_count.to_string()], + ["schema_count", s.schema_count.to_string()] + ]), + Record::Metadata(m) => json!([["metadata", m.metadata], ["name", m.name]]), + Record::MetadataIndex(i) => json!([ + ["length", i.length.to_string()], + ["name", i.name], + ["offset", i.offset.to_string()] + ]), + Record::SummaryOffset(s) => json!([ + ["group_length", s.group_length.to_string()], + ["group_opcode", s.group_opcode.to_string()], + ["group_start", s.group_start.to_string()] + ]), + Record::DataEnd(d) => json!([["data_section_crc", d.data_section_crc.to_string()]]), + Record::Unknown { opcode, .. } => { + panic!("Unknown record in conformance test: (op {opcode})") + } + } +} + +pub fn as_json(view: &Record<'_>) -> Value { + let typename = get_type(view); + let fields = get_fields(view); + json!({"type": typename, "fields": fields}) +} diff --git a/rust/examples/conformance_reader.rs b/rust/examples/conformance_reader.rs index 941ee9386e..3bfa97ae19 100644 --- a/rust/examples/conformance_reader.rs +++ b/rust/examples/conformance_reader.rs @@ -1,136 +1,11 @@ -use mcap::records::Record; - -use std::{collections::BTreeMap, env, process}; +#[path = "common/serialization.rs"] +mod serialization; use serde_json::{json, Value}; -// We don't want to force Serde on users just for the sake of the conformance tests. -// (In what context would you want to serialize individual records of a MCAP?) -// Stamp out and stringify them ourselves: - -fn get_type(rec: &Record<'_>) -> &'static str { - match rec { - Record::Header(_) => "Header", - Record::Footer(_) => "Footer", - Record::Schema { .. } => "Schema", - Record::Channel(_) => "Channel", - Record::Message { .. } => "Message", - Record::Chunk { .. } => "Chunk", - Record::MessageIndex(_) => "MessageIndex", - Record::ChunkIndex(_) => "ChunkIndex", - Record::Attachment { .. 
} => "Attachment", - Record::AttachmentIndex(_) => "AttachmentIndex", - Record::Statistics(_) => "Statistics", - Record::Metadata(_) => "Metadata", - Record::MetadataIndex(_) => "MetadataIndex", - Record::SummaryOffset(_) => "SummaryOffset", - Record::DataEnd(_) => "DataEnd", - Record::Unknown { opcode, .. } => { - panic!("Unknown record in conformance test: (op {opcode})") - } - } -} - -fn get_fields(rec: &Record<'_>) -> Value { - fn b2s(bytes: &[u8]) -> Vec { - bytes.iter().map(|b| b.to_string()).collect() - } - fn m2s(map: &BTreeMap) -> BTreeMap { - map.iter() - .map(|(k, v)| (k.to_string(), v.to_string())) - .collect() - } - - match rec { - Record::Header(h) => json!([["library", h.library], ["profile", h.profile]]), - Record::Footer(f) => json!([ - ["summary_crc", f.summary_crc.to_string()], - ["summary_offset_start", f.summary_offset_start.to_string()], - ["summary_start", f.summary_start.to_string()] - ]), - Record::Schema { header, data } => json!([ - ["data", b2s(data)], - ["encoding", header.encoding], - ["id", header.id.to_string()], - ["name", header.name] - ]), - Record::Channel(c) => json!([ - ["id", c.id.to_string()], - ["message_encoding", c.message_encoding], - ["metadata", c.metadata], - ["schema_id", c.schema_id.to_string()], - ["topic", c.topic] - ]), - Record::Message { header, data } => json!([ - ["channel_id", header.channel_id.to_string()], - ["data", b2s(data)], - ["log_time", header.log_time.to_string()], - ["publish_time", header.publish_time.to_string()], - ["sequence", header.sequence.to_string()] - ]), - Record::Chunk { .. } => unreachable!("Chunks are flattened"), - Record::MessageIndex(_) => unreachable!("MessageIndexes are skipped"), - Record::ChunkIndex(i) => json!([ - ["chunk_length", i.chunk_length.to_string()], - ["chunk_start_offset", i.chunk_start_offset.to_string()], - ["compressed_size", i.compressed_size.to_string()], - ["compression", i.compression], - ["message_end_time", i.message_end_time.to_string()], - ["message_index_length", i.message_index_length.to_string()], - ["message_index_offsets", m2s(&i.message_index_offsets)], - ["message_start_time", i.message_start_time.to_string()], - ["uncompressed_size", i.uncompressed_size.to_string()] - ]), - Record::Attachment { header, data } => json!([ - ["create_time", header.create_time.to_string()], - ["data", b2s(data)], - ["log_time", header.log_time.to_string()], - ["media_type", header.media_type], - ["name", header.name] - ]), - Record::AttachmentIndex(i) => json!([ - ["create_time", i.create_time.to_string()], - ["data_size", i.data_size.to_string()], - ["length", i.length.to_string()], - ["log_time", i.log_time.to_string()], - ["media_type", i.media_type], - ["name", i.name], - ["offset", i.offset.to_string()] - ]), - Record::Statistics(s) => json!([ - ["attachment_count", s.attachment_count.to_string()], - ["channel_count", s.channel_count.to_string()], - ["channel_message_counts", m2s(&s.channel_message_counts)], - ["chunk_count", s.chunk_count.to_string()], - ["message_count", s.message_count.to_string()], - ["message_end_time", s.message_end_time.to_string()], - ["message_start_time", s.message_start_time.to_string()], - ["metadata_count", s.metadata_count.to_string()], - ["schema_count", s.schema_count.to_string()] - ]), - Record::Metadata(m) => json!([["metadata", m.metadata], ["name", m.name]]), - Record::MetadataIndex(i) => json!([ - ["length", i.length.to_string()], - ["name", i.name], - ["offset", i.offset.to_string()] - ]), - Record::SummaryOffset(s) => json!([ - ["group_length", 
s.group_length.to_string()], - ["group_opcode", s.group_opcode.to_string()], - ["group_start", s.group_start.to_string()] - ]), - Record::DataEnd(d) => json!([["data_section_crc", d.data_section_crc.to_string()]]), - Record::Unknown { opcode, .. } => { - panic!("Unknown record in conformance test: (op {opcode})") - } - } -} - -fn as_json(view: &Record<'_>) -> Value { - let typename = get_type(view); - let fields = get_fields(view); - json!({"type": typename, "fields": fields}) -} +use mcap::records::Record; +use std::env; +use std::process; pub fn main() { let args: Vec = env::args().collect(); @@ -143,7 +18,7 @@ pub fn main() { for rec in mcap::read::ChunkFlattener::new(&file).expect("Couldn't read file") { let r = rec.expect("failed to read next record"); if !matches!(r, Record::MessageIndex(_)) { - json_records.push(as_json(&r)); + json_records.push(serialization::as_json(&r)); } } let out = json!({ "records": json_records }); diff --git a/rust/examples/conformance_reader_async.rs b/rust/examples/conformance_reader_async.rs new file mode 100644 index 0000000000..ba8dfb94a8 --- /dev/null +++ b/rust/examples/conformance_reader_async.rs @@ -0,0 +1,34 @@ +#[path = "common/serialization.rs"] +mod serialization; + +use serde_json::{json, Value}; + +use serialization::as_json; +use std::env; +use std::process; +use tokio::fs::File; + +use tokio; + +#[tokio::main(flavor = "current_thread")] +async fn main() { + let args: Vec = env::args().collect(); + if args.len() < 2 { + eprintln!("Please supply an MCAP file as argument"); + process::exit(1); + } + let file = File::open(&args[1]).await.expect("couldn't open file"); + let mut reader = mcap::tokio::RecordReader::new(file); + + let mut json_records: Vec = vec![]; + let mut buf: Vec = Vec::new(); + while let Some(opcode) = reader.next_record(&mut buf).await { + let opcode = opcode.expect("failed to read next record"); + if opcode != mcap::records::op::MESSAGE_INDEX { + let parsed = mcap::parse_record(opcode, &buf[..]).expect("failed to parse record"); + json_records.push(as_json(&parsed)); + } + } + let out = json!({ "records": json_records }); + print!("{}", serde_json::to_string_pretty(&out).unwrap()); +} diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 9b2eb1b1c7..bdb1f389d9 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -75,6 +75,8 @@ pub mod read; pub mod records; +#[cfg(feature = "tokio")] +pub mod tokio; pub mod write; mod io_utils; @@ -119,6 +121,8 @@ pub enum McapError { UnexpectedEof, #[error("Chunk ended in the middle of a record")] UnexpectedEoc, + #[error("Record with opcode {opcode:02X} has length {len}, need at least {expected} to parse")] + RecordTooShort { opcode: u8, len: u64, expected: u64 }, #[error("Message {0} referenced unknown channel {1}")] UnknownChannel(u32, u16), #[error("Channel `{0}` referenced unknown schema {1}")] @@ -196,5 +200,5 @@ pub struct Attachment<'a> { pub data: Cow<'a, [u8]>, } -pub use read::{MessageStream, Summary}; +pub use read::{parse_record, MessageStream, Summary}; pub use write::{WriteOptions, Writer}; diff --git a/rust/src/read.rs b/rust/src/read.rs index 72dad85fb2..94d8d53607 100644 --- a/rust/src/read.rs +++ b/rust/src/read.rs @@ -136,15 +136,16 @@ fn read_record_from_slice<'a>(buf: &mut &'a [u8]) -> McapResult McapResult> { +/// Given a records' opcode and data, parse into a Record. The resulting Record will contain +/// borrowed slices from `body`. +pub fn parse_record(op: u8, body: &[u8]) -> McapResult> { macro_rules! 
record { ($b:ident) => {{ let mut cur = Cursor::new($b); @@ -278,7 +279,7 @@ impl<'a> ChunkReader<'a> { #[cfg(feature = "lz4")] "lz4" => ChunkDecompressor::Compressed(Some(CountingCrcReader::new(Box::new( - lz4_flex::frame::FrameDecoder::new(data), + lz4::Decoder::new(data)?, )))), #[cfg(not(feature = "lz4"))] diff --git a/rust/src/records.rs b/rust/src/records.rs index eeb842c495..0ac5d7f72a 100644 --- a/rust/src/records.rs +++ b/rust/src/records.rs @@ -99,6 +99,44 @@ impl Record<'_> { Record::Unknown { opcode, .. } => *opcode, } } + + /// Moves this value into a fully-owned variant with no borrows. This should be free for + /// already-owned values. + pub fn into_owned(self) -> Record<'static> { + match self { + Record::Header(header) => Record::Header(header), + Record::Footer(footer) => Record::Footer(footer), + Record::Schema { header, data } => Record::Schema { + header, + data: Cow::Owned(data.into_owned()), + }, + Record::Channel(channel) => Record::Channel(channel), + Record::Message { header, data } => Record::Message { + header, + data: Cow::Owned(data.into_owned()), + }, + Record::Chunk { header, data } => Record::Chunk { + header, + data: Cow::Owned(data.into_owned()), + }, + Record::MessageIndex(index) => Record::MessageIndex(index), + Record::ChunkIndex(index) => Record::ChunkIndex(index), + Record::Attachment { header, data } => Record::Attachment { + header, + data: Cow::Owned(data.into_owned()), + }, + Record::AttachmentIndex(index) => Record::AttachmentIndex(index), + Record::Statistics(statistics) => Record::Statistics(statistics), + Record::Metadata(metadata) => Record::Metadata(metadata), + Record::MetadataIndex(index) => Record::MetadataIndex(index), + Record::SummaryOffset(offset) => Record::SummaryOffset(offset), + Record::DataEnd(end) => Record::DataEnd(end), + Record::Unknown { opcode, data } => Record::Unknown { + opcode, + data: Cow::Owned(data.into_owned()), + }, + } + } } #[binrw] diff --git a/rust/src/tokio.rs b/rust/src/tokio.rs new file mode 100644 index 0000000000..cf3d68ebd5 --- /dev/null +++ b/rust/src/tokio.rs @@ -0,0 +1,8 @@ +//! Read MCAP data from a stream asynchronously +#[cfg(feature = "lz4")] +mod lz4; +pub mod read; +mod read_exact_or_zero; + +pub use read::{RecordReader, RecordReaderOptions}; +use read_exact_or_zero::read_exact_or_zero; diff --git a/rust/src/tokio/lz4.rs b/rust/src/tokio/lz4.rs new file mode 100644 index 0000000000..05a0f65a9e --- /dev/null +++ b/rust/src/tokio/lz4.rs @@ -0,0 +1,146 @@ +use std::io::{Error, ErrorKind, Result}; +use std::pin::{pin, Pin}; +use std::ptr; +use std::task::{Context, Poll}; + +use lz4::liblz4::{ + check_error, LZ4FDecompressionContext, LZ4F_createDecompressionContext, LZ4F_decompress, + LZ4F_freeDecompressionContext, LZ4F_VERSION, +}; +use tokio::io::{AsyncRead, ReadBuf}; + +const BUFFER_SIZE: usize = 32 * 1024; + +#[derive(Debug)] +struct DecoderContext { + c: LZ4FDecompressionContext, +} + +// An adaptation of the [`lz4::Decoder`] [`std::io::Read`] impl, but for [`tokio::io::AsyncRead`]. +// Code below is adapted from the [lz4](https://github.com/10XGenomics/lz4-rs) crate source. +#[derive(Debug)] +pub struct Lz4Decoder { + c: DecoderContext, + r: R, + input_buf: Box<[u8]>, + unread_input_start: usize, + unread_input_end: usize, + next: usize, +} + +impl Lz4Decoder { + /// Creates a new decoder which reads its input from the given + /// input stream. 
The input stream can be re-acquired by calling + /// `finish()` + pub fn new(r: R) -> Result> { + Ok(Lz4Decoder { + r, + c: DecoderContext::new()?, + input_buf: vec![0; BUFFER_SIZE].into_boxed_slice(), + unread_input_start: BUFFER_SIZE, + unread_input_end: BUFFER_SIZE, + // Minimal LZ4 stream size + next: 11, + }) + } + + pub fn finish(self) -> (R, Result<()>) { + ( + self.r, + match self.next { + 0 => Ok(()), + _ => Err(Error::new( + ErrorKind::Interrupted, + "Finish called before end of compressed stream", + )), + }, + ) + } +} + +impl AsyncRead for Lz4Decoder { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + output_buf: &mut ReadBuf<'_>, + ) -> Poll> { + // Thre's nothing left to read. + if self.next == 0 || output_buf.remaining() == 0 { + return Poll::Ready(Ok(())); + } + let mut written_len: usize = 0; + let this = self.get_mut(); + while written_len == 0 { + // this reader buffers input data until it has enough to present to the lz4 frame decoder. + // if there's nothing unread, request more data from the reader. + if this.unread_input_start >= this.unread_input_end { + // request a full BUFFER_SIZE or the amount requested by the lz4 frame decoder, + // whichever is less. + let need = std::cmp::min(BUFFER_SIZE, this.next); + // try reading more input data. If it's not ready, return and try again later. + // NOTE: we don't need to save this stack frame as a future and re-enter it later + // because the only frame-local state `written_len` has not been modified and can be + // discarded. + { + let mut input_buf = ReadBuf::new(&mut this.input_buf[..need]); + let result = pin!(&mut this.r).poll_read(cx, &mut input_buf); + match result { + Poll::Pending => return result, + Poll::Ready(Err(_)) => return result, + _ => {} + }; + this.unread_input_start = 0; + this.unread_input_end = input_buf.filled().len(); + this.next -= this.unread_input_end; + } + // The read succeeded. If zero bytes were read, we're at the end of the stream. + if this.unread_input_end == 0 { + return Poll::Ready(Ok(())); + } + } + // feed bytes from our input buffer into the compressor, writing into the output + // buffer until either the output buffer is full or the input buffer is consumed. 
+ while (written_len < output_buf.remaining()) + && (this.unread_input_start < this.unread_input_end) + { + let mut src_size = this.unread_input_end - this.unread_input_start; + let mut dst_size = output_buf.remaining() - written_len; + let prev_filled = output_buf.filled().len(); + let len = check_error(unsafe { + LZ4F_decompress( + this.c.c, + output_buf.initialize_unfilled().as_mut_ptr(), + &mut dst_size, + this.input_buf[this.unread_input_start..].as_ptr(), + &mut src_size, + ptr::null(), + ) + })?; + this.unread_input_start += src_size; + written_len += dst_size; + output_buf.set_filled(prev_filled + written_len); + if len == 0 { + this.next = 0; + return Poll::Ready(Ok(())); + } else if this.next < len { + this.next = len; + } + } + } + Poll::Ready(Ok(())) + } +} + +impl DecoderContext { + fn new() -> Result { + let mut context = LZ4FDecompressionContext(ptr::null_mut()); + check_error(unsafe { LZ4F_createDecompressionContext(&mut context, LZ4F_VERSION) })?; + Ok(DecoderContext { c: context }) + } +} + +impl Drop for DecoderContext { + fn drop(&mut self) { + unsafe { LZ4F_freeDecompressionContext(self.c) }; + } +} diff --git a/rust/src/tokio/read.rs b/rust/src/tokio/read.rs new file mode 100644 index 0000000000..50a1384250 --- /dev/null +++ b/rust/src/tokio/read.rs @@ -0,0 +1,434 @@ +use std::pin::{pin, Pin}; +use std::task::{Context, Poll}; + +#[cfg(feature = "zstd")] +use async_compression::tokio::bufread::ZstdDecoder; +use binrw::BinReaderExt; +use tokio::io::{AsyncRead, AsyncReadExt, ReadBuf, Take}; + +#[cfg(feature = "lz4")] +use crate::tokio::lz4::Lz4Decoder; +use crate::tokio::read_exact_or_zero; +use crate::{records, McapError, McapResult, MAGIC}; + +enum ReaderState { + Base(R), + UncompressedChunk(Take), + #[cfg(feature = "zstd")] + ZstdChunk(ZstdDecoder>>), + #[cfg(feature = "lz4")] + Lz4Chunk(Lz4Decoder>), + Empty, +} + +impl AsyncRead for ReaderState +where + R: AsyncRead + std::marker::Unpin, +{ + fn poll_read( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + match self.get_mut() { + ReaderState::Base(r) => pin!(r).poll_read(cx, buf), + ReaderState::UncompressedChunk(r) => pin!(r).poll_read(cx, buf), + #[cfg(feature = "zstd")] + ReaderState::ZstdChunk(r) => pin!(r).poll_read(cx, buf), + #[cfg(feature = "lz4")] + ReaderState::Lz4Chunk(r) => pin!(r).poll_read(cx, buf), + ReaderState::Empty => { + panic!("invariant: reader is only set to empty while swapping with another valid variant") + } + } + } +} +impl ReaderState +where + R: AsyncRead, +{ + pub fn into_inner(self) -> McapResult { + match self { + ReaderState::Base(reader) => Ok(reader), + ReaderState::UncompressedChunk(take) => Ok(take.into_inner()), + #[cfg(feature = "zstd")] + ReaderState::ZstdChunk(decoder) => Ok(decoder.into_inner().into_inner().into_inner()), + #[cfg(feature = "lz4")] + ReaderState::Lz4Chunk(decoder) => { + let (output, result) = decoder.finish(); + result?; + Ok(output.into_inner()) + } + ReaderState::Empty => { + panic!("invariant: reader is only set to empty while swapping with another valid variant") + } + } + } +} + +/// Reads an MCAP file record-by-record, writing the raw record data into a caller-provided Vec. 
+/// ```no_run +/// use std::fs; +/// +/// use tokio::fs::File; +/// +/// async fn read_it() { +/// let file = File::open("in.mcap").await.expect("couldn't open file"); +/// let mut record_buf: Vec = Vec::new(); +/// let mut reader = mcap::tokio::RecordReader::new(file); +/// while let Some(result) = reader.next_record(&mut record_buf).await { +/// let opcode = result.expect("couldn't read next record"); +/// let raw_record = mcap::parse_record(opcode, &record_buf[..]).expect("couldn't parse"); +/// // do something with the record... +/// } +/// } +/// ``` +pub struct RecordReader { + reader: ReaderState, + options: RecordReaderOptions, + start_magic_seen: bool, + footer_seen: bool, + to_discard_after_chunk: usize, + scratch: Box<[u8]>, +} + +#[derive(Default, Clone)] +pub struct RecordReaderOptions { + /// If true, the reader will not expect the MCAP magic at the start of the stream. + pub skip_start_magic: bool, + /// If true, the reader will not expect the MCAP magic at the end of the stream. + pub skip_end_magic: bool, + /// If true, the reader will yield entire chunk records. Otherwise, the reader will decompress + /// and read into the chunk, yielding the records inside. + pub emit_chunks: bool, +} + +enum Cmd { + YieldRecord(u8), + EnterChunk { + header: records::ChunkHeader, + len: u64, + }, + ExitChunk, + Stop, +} + +impl RecordReader +where + R: AsyncRead + std::marker::Unpin, +{ + pub fn new(reader: R) -> Self { + Self::new_with_options(reader, &RecordReaderOptions::default()) + } + + pub fn new_with_options(reader: R, options: &RecordReaderOptions) -> Self { + Self { + reader: ReaderState::Base(reader), + options: options.clone(), + start_magic_seen: false, + footer_seen: false, + to_discard_after_chunk: 0, + scratch: vec![0; 1024].into_boxed_slice(), + } + } + + pub fn into_inner(self) -> McapResult { + self.reader.into_inner() + } + + /// Reads the next record from the input stream and copies the raw content into `data`. + /// Returns the record's opcode as a result. 
+ pub async fn next_record(&mut self, data: &mut Vec) -> Option> { + loop { + let cmd = match self.next_record_inner(data).await { + Ok(cmd) => cmd, + Err(err) => return Some(Err(err)), + }; + match cmd { + Cmd::Stop => return None, + Cmd::YieldRecord(opcode) => return Some(Ok(opcode)), + Cmd::EnterChunk { header, len } => { + let mut reader_state = ReaderState::Empty; + std::mem::swap(&mut reader_state, &mut self.reader); + match header.compression.as_str() { + #[cfg(feature = "zstd")] + "zstd" => { + let reader = match reader_state.into_inner() { + Ok(reader) => reader, + Err(err) => return Some(Err(err)), + }; + self.reader = ReaderState::ZstdChunk(ZstdDecoder::new( + tokio::io::BufReader::new(reader.take(header.compressed_size)), + )); + } + #[cfg(feature = "lz4")] + "lz4" => { + let reader = match reader_state.into_inner() { + Ok(reader) => reader, + Err(err) => return Some(Err(err)), + }; + let decoder = match Lz4Decoder::new(reader.take(header.compressed_size)) + { + Ok(decoder) => decoder, + Err(err) => return Some(Err(err.into())), + }; + self.reader = ReaderState::Lz4Chunk(decoder); + } + "" => { + let reader = match reader_state.into_inner() { + Ok(reader) => reader, + Err(err) => return Some(Err(err)), + }; + self.reader = + ReaderState::UncompressedChunk(reader.take(header.compressed_size)); + } + _ => { + std::mem::swap(&mut reader_state, &mut self.reader); + return Some(Err(McapError::UnsupportedCompression( + header.compression.clone(), + ))); + } + } + self.to_discard_after_chunk = len as usize + - (40 + header.compression.len() + header.compressed_size as usize); + } + Cmd::ExitChunk => { + let mut reader_state = ReaderState::Empty; + std::mem::swap(&mut reader_state, &mut self.reader); + self.reader = ReaderState::Base(match reader_state.into_inner() { + Ok(reader) => reader, + Err(err) => return Some(Err(err)), + }); + while self.to_discard_after_chunk > 0 { + let to_read = if self.to_discard_after_chunk > self.scratch.len() { + self.scratch.len() + } else { + self.to_discard_after_chunk + }; + match self.reader.read(&mut self.scratch[..to_read]).await { + Ok(n) => self.to_discard_after_chunk -= n, + Err(err) => return Some(Err(err.into())), + }; + } + } + }; + } + } + + async fn next_record_inner(&mut self, data: &mut Vec) -> McapResult { + if let ReaderState::Base(reader) = &mut self.reader { + if !self.start_magic_seen && !self.options.skip_start_magic { + reader.read_exact(&mut self.scratch[..MAGIC.len()]).await?; + if &self.scratch[..MAGIC.len()] != MAGIC { + return Err(McapError::BadMagic); + } + self.start_magic_seen = true; + } + if self.footer_seen && !self.options.skip_end_magic { + reader.read_exact(&mut self.scratch[..MAGIC.len()]).await?; + if &self.scratch[..MAGIC.len()] != MAGIC { + return Err(McapError::BadMagic); + } + return Ok(Cmd::Stop); + } + let readlen = read_exact_or_zero(reader, &mut self.scratch[..9]).await?; + if readlen == 0 { + if self.options.skip_end_magic { + return Ok(Cmd::Stop); + } else { + return Err(McapError::UnexpectedEof); + } + } + let opcode = self.scratch[0]; + if opcode == records::op::FOOTER { + self.footer_seen = true; + } + let record_len = u64::from_le_bytes(self.scratch[1..9].try_into().unwrap()); + if opcode == records::op::CHUNK && !self.options.emit_chunks { + let header = read_chunk_header(reader, data, record_len).await?; + return Ok(Cmd::EnterChunk { + header, + len: record_len, + }); + } + data.resize(record_len as usize, 0); + reader.read_exact(&mut data[..]).await?; + Ok(Cmd::YieldRecord(opcode)) + } else { + let 
len = read_exact_or_zero(&mut self.reader, &mut self.scratch[..9]).await?; + if len == 0 { + return Ok(Cmd::ExitChunk); + } + let opcode = self.scratch[0]; + let record_len = u64::from_le_bytes(self.scratch[1..9].try_into().unwrap()); + data.resize(record_len as usize, 0); + self.reader.read_exact(&mut data[..]).await?; + Ok(Cmd::YieldRecord(opcode)) + } + } +} + +async fn read_chunk_header( + reader: &mut R, + scratch: &mut Vec, + record_len: u64, +) -> McapResult { + let mut header = records::ChunkHeader { + message_start_time: 0, + message_end_time: 0, + uncompressed_size: 0, + uncompressed_crc: 0, + compression: String::new(), + compressed_size: 0, + }; + if record_len < 40 { + return Err(McapError::RecordTooShort { + opcode: records::op::CHUNK, + len: record_len, + expected: 40, + }); + } + scratch.resize(32, 0); + reader.read_exact(&mut scratch[..]).await?; + let compression_len: u32 = { + let mut cursor = std::io::Cursor::new(&scratch); + header.message_start_time = cursor.read_le()?; + header.message_end_time = cursor.read_le()?; + header.uncompressed_size = cursor.read_le()?; + header.uncompressed_crc = cursor.read_le()?; + cursor.read_le()? + }; + scratch.resize(compression_len as usize, 0); + if record_len < (40 + compression_len) as u64 { + return Err(McapError::RecordTooShort { + opcode: records::op::CHUNK, + len: record_len, + expected: (40 + compression_len) as u64, + }); + } + reader.read_exact(&mut scratch[..]).await?; + header.compression = match std::str::from_utf8(&scratch[..]) { + Ok(val) => val.to_owned(), + Err(err) => { + return Err(McapError::Parse(binrw::error::Error::Custom { + pos: 32, + err: Box::new(err), + })); + } + }; + scratch.resize(8, 0); + reader.read_exact(&mut scratch[..]).await?; + header.compressed_size = u64::from_le_bytes(scratch[..].try_into().unwrap()); + let available = record_len - (32 + compression_len as u64 + 8); + if available < header.compressed_size { + return Err(McapError::BadChunkLength { + header: header.compressed_size, + available, + }); + } + Ok(header) +} + +#[cfg(test)] +mod tests { + use crate::read::parse_record; + use std::collections::BTreeMap; + + use super::*; + #[tokio::test] + async fn test_record_reader() -> Result<(), McapError> { + for compression in [ + None, + #[cfg(feature = "zstd")] + Some(crate::Compression::Zstd), + #[cfg(feature = "lz4")] + Some(crate::Compression::Lz4), + ] { + let mut buf = std::io::Cursor::new(Vec::new()); + { + let mut writer = crate::WriteOptions::new() + .compression(compression) + .create(&mut buf)?; + let channel = std::sync::Arc::new(crate::Channel { + topic: "chat".to_owned(), + schema: None, + message_encoding: "json".to_owned(), + metadata: BTreeMap::new(), + }); + writer.add_channel(&channel)?; + writer.write(&crate::Message { + channel, + sequence: 0, + log_time: 0, + publish_time: 0, + data: (&[0, 1, 2]).into(), + })?; + writer.finish()?; + } + let mut reader = RecordReader::new(std::io::Cursor::new(buf.into_inner())); + let mut record = Vec::new(); + let mut opcodes: Vec = Vec::new(); + while let Some(opcode) = reader.next_record(&mut record).await { + let opcode = opcode?; + opcodes.push(opcode); + parse_record(opcode, &record)?; + } + assert_eq!( + opcodes.as_slice(), + [ + records::op::HEADER, + records::op::CHANNEL, + records::op::MESSAGE, + records::op::MESSAGE_INDEX, + records::op::DATA_END, + records::op::CHANNEL, + records::op::CHUNK_INDEX, + records::op::STATISTICS, + records::op::SUMMARY_OFFSET, + records::op::SUMMARY_OFFSET, + records::op::SUMMARY_OFFSET, + 
records::op::FOOTER, + ], + "reads opcodes from MCAP compressed with {:?}", + compression + ); + } + Ok(()) + } + #[cfg(feature = "lz4")] + #[tokio::test] + async fn test_lz4_decompression() -> Result<(), McapError> { + let mut buf = std::io::Cursor::new(Vec::new()); + { + let mut writer = crate::WriteOptions::new() + .compression(Some(crate::Compression::Lz4)) + .create(&mut buf)?; + let channel = std::sync::Arc::new(crate::Channel { + topic: "chat".to_owned(), + schema: None, + message_encoding: "json".to_owned(), + metadata: BTreeMap::new(), + }); + let data: Vec = vec![0; 1024]; + writer.add_channel(&channel)?; + for n in 0..10000 { + { + writer.write(&crate::Message { + channel: channel.clone(), + log_time: n, + publish_time: n, + sequence: n as u32, + data: std::borrow::Cow::Owned(data.clone()), + })?; + } + } + writer.finish()?; + } + let mut reader = RecordReader::new(std::io::Cursor::new(buf.into_inner())); + let mut record = Vec::new(); + while let Some(opcode) = reader.next_record(&mut record).await { + parse_record(opcode?, &record)?; + } + Ok(()) + } +} diff --git a/rust/src/tokio/read_exact_or_zero.rs b/rust/src/tokio/read_exact_or_zero.rs new file mode 100644 index 0000000000..cc4eb902d5 --- /dev/null +++ b/rust/src/tokio/read_exact_or_zero.rs @@ -0,0 +1,103 @@ +use tokio::io::{AsyncRead, AsyncReadExt}; + +/// read up to `buf.len()` bytes from `r` into `buf`. This repeatedly calls read() on `r` until +/// either the buffer is full or EOF is reached. If either 0 or buf.len() bytes were read before +/// EOF, Ok(n) is returned. If EOF is reached after 0 bytes but before buf.len(), Err(UnexpectedEOF) +/// is returned. +/// This is useful for cases where we expect either to read either a whole MCAP record or EOF. +pub(crate) async fn read_exact_or_zero( + r: &mut R, + buf: &mut [u8], +) -> Result { + let mut pos: usize = 0; + loop { + let readlen = r.read(&mut buf[pos..]).await?; + if readlen == 0 { + if pos != 0 { + return Err(std::io::ErrorKind::UnexpectedEof.into()); + } else { + return Ok(0); + } + } + pos += readlen; + if pos == buf.len() { + return Ok(pos); + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + use std::cmp::min; + + struct ZeroReader { + remaining: usize, + max_read_len: usize, + } + + impl AsyncRead for ZeroReader { + fn poll_read( + mut self: std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> std::task::Poll> { + let max_read_len = self.as_ref().max_read_len; + let remaining = self.as_ref().remaining; + if remaining == 0 { + return std::task::Poll::Ready(Ok(())); + } + let to_fill = min(min(remaining, buf.remaining()), max_read_len); + buf.initialize_unfilled_to(to_fill).fill(0); + buf.set_filled(to_fill); + self.as_mut().remaining -= to_fill; + return std::task::Poll::Ready(Ok(())); + } + } + #[tokio::test] + async fn test_full_read_is_not_error() { + let mut r = ZeroReader { + remaining: 10, + max_read_len: 10, + }; + let mut buf: Vec = vec![1; 10]; + let result = read_exact_or_zero(&mut r, &mut buf).await; + assert_eq!(result.ok(), Some(10)); + assert_eq!(&buf[..], &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + } + + #[tokio::test] + async fn test_eof_is_not_error() { + let mut r = ZeroReader { + remaining: 0, + max_read_len: 10, + }; + let mut buf: Vec = vec![1; 10]; + let result = read_exact_or_zero(&mut r, &mut buf).await; + assert_eq!(result.ok(), Some(0)); + assert_eq!(&buf[..], &[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) + } + #[tokio::test] + async fn test_repeated_read_calls() { + let mut r = ZeroReader { + 
remaining: 10, + max_read_len: 4, + }; + let mut buf: Vec = vec![1; 10]; + let result = read_exact_or_zero(&mut r, &mut buf).await; + assert_eq!(result.ok(), Some(10)); + assert_eq!(&buf[..], &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + } + #[tokio::test] + async fn test_partial_read_is_error() { + let mut r = ZeroReader { + remaining: 4, + max_read_len: 2, + }; + let mut buf: Vec = vec![1; 10]; + let result = read_exact_or_zero(&mut r, &mut buf).await; + assert!(!result.is_ok()); + assert_eq!(&buf[..], &[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]); + } +} diff --git a/rust/src/write.rs b/rust/src/write.rs index 32e47a0524..3be18f9282 100644 --- a/rust/src/write.rs +++ b/rust/src/write.rs @@ -153,10 +153,7 @@ impl WriteOptions { /// If `None`, chunks will not be automatically closed and the user must call `flush()` to /// begin a new chunk. pub fn chunk_size(self, chunk_size: Option) -> Self { - Self { - chunk_size: chunk_size, - ..self - } + Self { chunk_size, ..self } } /// specifies whether to use chunks for storing messages. @@ -711,7 +708,7 @@ enum Compressor { #[cfg(feature = "zstd")] Zstd(zstd::Encoder<'static, W>), #[cfg(feature = "lz4")] - Lz4(lz4_flex::frame::FrameEncoder), + Lz4(lz4::Encoder), } impl Compressor { @@ -721,7 +718,11 @@ impl Compressor { #[cfg(feature = "zstd")] Compressor::Zstd(w) => w.finish()?, #[cfg(feature = "lz4")] - Compressor::Lz4(w) => w.finish()?, + Compressor::Lz4(w) => { + let (output, result) = w.finish(); + result?; + output + } }) } } @@ -797,7 +798,7 @@ impl ChunkWriter { Compressor::Zstd(enc) } #[cfg(feature = "lz4")] - Some(Compression::Lz4) => Compressor::Lz4(lz4_flex::frame::FrameEncoder::new(writer)), + Some(Compression::Lz4) => Compressor::Lz4(lz4::EncoderBuilder::new().build(writer)?), #[cfg(not(any(feature = "zstd", feature = "lz4")))] Some(_) => unreachable!("`Compression` is an empty enum that cannot be instantiated"), None => Compressor::Null(writer), diff --git a/rust/tests/attachment.rs b/rust/tests/attachment.rs index 76b6762777..57eb98b1eb 100644 --- a/rust/tests/attachment.rs +++ b/rust/tests/attachment.rs @@ -66,7 +66,8 @@ fn round_trip() -> Result<()> { ..Default::default() }), attachment_indexes: vec![mcap::records::AttachmentIndex { - offset: 38, // Finicky - depends on the length of the library version string + // offset depends on the length of the embedded library string, which includes the crate version + offset: 33 + (env!("CARGO_PKG_VERSION").len() as u64), length: 78, log_time: 2, create_time: 1, diff --git a/rust/tests/metadata.rs b/rust/tests/metadata.rs index 606d81e791..905ab98ff7 100644 --- a/rust/tests/metadata.rs +++ b/rust/tests/metadata.rs @@ -56,7 +56,8 @@ fn round_trip() -> Result<()> { ..Default::default() }), metadata_indexes: vec![mcap::records::MetadataIndex { - offset: 38, // Finicky - depends on the length of the library version string + // offset depends on the length of the embedded library string, which includes the crate version + offset: 33 + (env!("CARGO_PKG_VERSION").len() as u64), length: 41, name: String::from("myMetadata"), }], diff --git a/tests/conformance/scripts/run-tests/runners/RustAsyncReaderTestRunner.ts b/tests/conformance/scripts/run-tests/runners/RustAsyncReaderTestRunner.ts new file mode 100644 index 0000000000..8418d62aed --- /dev/null +++ b/tests/conformance/scripts/run-tests/runners/RustAsyncReaderTestRunner.ts @@ -0,0 +1,22 @@ +import { exec } from "child_process"; +import { join } from "path"; +import { promisify } from "util"; +import { TestVariant } from "variants/types"; + +import { 
StreamedReadTestRunner } from "./TestRunner"; +import { StreamedReadTestResult } from "../types"; + +export default class RustAsyncReaderTestRunner extends StreamedReadTestRunner { + readonly name = "rust-async-streamed-reader"; + + async runReadTest(filePath: string): Promise { + const { stdout } = await promisify(exec)(`./conformance_reader_async ${filePath}`, { + cwd: join(__dirname, "../../../../../rust/target/debug/examples"), + }); + return JSON.parse(stdout.trim()) as StreamedReadTestResult; + } + + supportsVariant(_variant: TestVariant): boolean { + return true; + } +} diff --git a/tests/conformance/scripts/run-tests/runners/index.ts b/tests/conformance/scripts/run-tests/runners/index.ts index 3c76f02148..6af475da47 100644 --- a/tests/conformance/scripts/run-tests/runners/index.ts +++ b/tests/conformance/scripts/run-tests/runners/index.ts @@ -8,6 +8,7 @@ import KaitaiStructReaderTestRunner from "./KaitaiStructReaderTestRunner"; import PythonIndexedReaderTestRunner from "./PythonIndexedReaderTestRunner"; import PythonStreamedReaderTestRunner from "./PythonStreamedReaderTestRunner"; import PythonWriterTestRunner from "./PythonWriterTestRunner"; +import RustAsyncReaderTestRunner from "./RustAsyncReaderTestRunner"; import RustReaderTestRunner from "./RustReaderTestRunner"; import RustWriterTestRunner from "./RustWriterTestRunner"; import SwiftIndexedReaderTestRunner from "./SwiftIndexedReaderTestRunner"; @@ -31,6 +32,7 @@ const runners: readonly (IndexedReadTestRunner | StreamedReadTestRunner | WriteT new TypescriptIndexedReaderTestRunner(), new TypescriptStreamedReaderTestRunner(), new TypescriptWriterTestRunner(), + new RustAsyncReaderTestRunner(), new RustReaderTestRunner(), new RustWriterTestRunner(), new SwiftWriterTestRunner(), From e591defaa95186cef27e37c49fa7e1f0c9f2e8a6 Mon Sep 17 00:00:00 2001 From: james-rms Date: Tue, 17 Sep 2024 09:59:15 +1000 Subject: [PATCH 37/44] go: indexed message iterator: guard against bad offsets in file (#1234) ### Changelog - if a file contains offsets that point outside of file bounds, the Go indexed reader implementation will return `mcap.ErrBadOffset` when attempting to use them. ### Docs None. ### Description There are files being uploaded to Foxglove that have invalid offsets inside them - these cause errors in `io.Seek`, which can't be told apart from true IO errors at the moment. This means if a read fails, we can't tell if it's because of a corrupt file or flaky I/O. This PR lets us disambiguate by returning a specific error from the library.
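Callers can separate this corruption error from transient I/O failures with `errors.Is`. The sketch below shows that pattern; it assumes only `mcap.NewReader`, `Reader.Info`, and `mcap.ErrBadOffset` as exercised by the test in this patch, and the `recording.mcap` path is a placeholder for whatever file the caller has on hand:

```go
package main

import (
	"bytes"
	"errors"
	"fmt"
	"os"

	"github.com/foxglove/mcap/go/mcap"
)

func main() {
	// Placeholder input; any indexed MCAP file works here.
	data, err := os.ReadFile("recording.mcap")
	if err != nil {
		fmt.Println("read failed:", err)
		return
	}
	reader, err := mcap.NewReader(bytes.NewReader(data))
	if err != nil {
		fmt.Println("open failed:", err)
		return
	}
	_, err = reader.Info()
	switch {
	case errors.Is(err, mcap.ErrBadOffset):
		// The file's own offsets point outside its bounds: the file is corrupt,
		// so retrying the read will not help.
		fmt.Println("corrupt MCAP index:", err)
	case err != nil:
		// Anything else is an ordinary read error and may be transient.
		fmt.Println("read error:", err)
	default:
		fmt.Println("summary section parsed successfully")
	}
}
```

Wrapping the sentinel with `%w` in `seekTo` is what makes this `errors.Is` check possible.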
--- go/mcap/indexed_message_iterator.go | 30 ++++++++++++++++++++++++----- go/mcap/reader_test.go | 26 ++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/go/mcap/indexed_message_iterator.go b/go/mcap/indexed_message_iterator.go index ba06c4e521..c5a7176623 100644 --- a/go/mcap/indexed_message_iterator.go +++ b/go/mcap/indexed_message_iterator.go @@ -4,16 +4,21 @@ import ( "bufio" "bytes" "encoding/binary" + "errors" "fmt" "io" "math/bits" "slices" "sort" + "math" + "github.com/klauspost/compress/zstd" "github.com/pierrec/lz4/v4" ) +var ErrBadOffset = errors.New("invalid offset") + const ( chunkBufferGrowthMultiple = 1.2 ) @@ -54,6 +59,7 @@ type indexedMessageIterator struct { attachmentIndexes []*AttachmentIndex metadataIndexes []*MetadataIndex footer *Footer + fileSize int64 curChunkIndex int messageIndexes []messageIndexWithChunkSlot @@ -68,14 +74,28 @@ type indexedMessageIterator struct { metadataCallback func(*Metadata) error } +func (it *indexedMessageIterator) seekTo(offset uint64) error { + if offset > uint64(math.MaxInt64) { + return fmt.Errorf("%w: %d > int64 max", ErrBadOffset, offset) + } + signedOffset := int64(offset) + if signedOffset >= it.fileSize { + return fmt.Errorf("%w: %d past file end %d", ErrBadOffset, offset, it.fileSize) + } + _, err := it.rs.Seek(signedOffset, io.SeekStart) + return err +} + // parseIndexSection parses the index section of the file and populates the // related fields of the structure. It must be called prior to any of the other // access methods. func (it *indexedMessageIterator) parseSummarySection() error { - _, err := it.rs.Seek(-8-4-8-8, io.SeekEnd) // magic, plus 20 bytes footer + const footerStartOffsetFromEnd = 8 + 4 + 8 + 8 // magic, plus 20 bytes footer + footerStartPos, err := it.rs.Seek(-footerStartOffsetFromEnd, io.SeekEnd) if err != nil { return err } + it.fileSize = footerStartPos + footerStartOffsetFromEnd buf := make([]byte, 8+20) _, err = io.ReadFull(it.rs, buf) if err != nil { @@ -96,9 +116,9 @@ func (it *indexedMessageIterator) parseSummarySection() error { it.hasReadSummarySection = true return nil } - _, err = it.rs.Seek(int64(footer.SummaryStart), io.SeekStart) + err = it.seekTo(footer.SummaryStart) if err != nil { - return fmt.Errorf("failed to seek to summary start") + return fmt.Errorf("failed to seek to summary start: %w", err) } lexer, err := NewLexer(bufio.NewReader(it.rs), &LexerOptions{ @@ -188,7 +208,7 @@ func (it *indexedMessageIterator) parseSummarySection() error { // loadChunk seeks to and decompresses a chunk into a chunk slot, then populates it.messageIndexes // with the offsets of messages in that chunk. 
func (it *indexedMessageIterator) loadChunk(chunkIndex *ChunkIndex) error { - _, err := it.rs.Seek(int64(chunkIndex.ChunkStartOffset), io.SeekStart) + err := it.seekTo(chunkIndex.ChunkStartOffset) if err != nil { return err } @@ -377,7 +397,7 @@ func (it *indexedMessageIterator) NextInto(msg *Message) (*Schema, *Channel, *Me // take care of the metadata here if it.metadataCallback != nil { for _, idx := range it.metadataIndexes { - _, err := it.rs.Seek(int64(idx.Offset), io.SeekStart) + err := it.seekTo(idx.Offset) if err != nil { return nil, nil, nil, fmt.Errorf("failed to seek to metadata: %w", err) } diff --git a/go/mcap/reader_test.go b/go/mcap/reader_test.go index e5bb35f15f..e1322f16a7 100644 --- a/go/mcap/reader_test.go +++ b/go/mcap/reader_test.go @@ -908,7 +908,6 @@ func TestOrderStableWithEquivalentTimestamps(t *testing.T) { } assert.Equal(t, uint64(0), msg.LogTime) msgNumber := binary.LittleEndian.Uint64(msg.Data) - fmt.Printf("msgNumber: %d\n", msgNumber) if numRead != 0 { assert.Less(t, msgNumber, lastMessageNumber) } @@ -1126,3 +1125,28 @@ func BenchmarkReader(b *testing.B) { }) } } + +func TestFooterOffsetErrorDetected(t *testing.T) { + buf := &bytes.Buffer{} + writer, err := NewWriter(buf, &WriterOptions{ + Chunked: true, + ChunkSize: 1024, + Compression: "", + }) + require.NoError(t, err) + require.NoError(t, writer.WriteHeader(&Header{})) + require.NoError(t, writer.WriteChannel(&Channel{ID: 1})) + require.NoError(t, writer.WriteMessage(&Message{ChannelID: 1})) + require.NoError(t, writer.Close()) + + // break the footer summary offset field. This is 8 + 8 + 4 + 8 bytes from end of file. + mcapBytes := buf.Bytes() + end := len(mcapBytes) + binary.LittleEndian.PutUint64(mcapBytes[end-8-8-4-8:], 999999999) + + reader, err := NewReader(bytes.NewReader(mcapBytes)) + require.NoError(t, err) + + _, err = reader.Info() + require.ErrorIs(t, err, ErrBadOffset) +} From 0f156d95a95debe88c0ccb49c8be64f6ea4547fb Mon Sep 17 00:00:00 2001 From: james-rms Date: Wed, 18 Sep 2024 11:02:22 +1000 Subject: [PATCH 38/44] bump go version to v1.6.0 (#1237) ### Changelog ### Docs ### Description
--- go/mcap/version.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/mcap/version.go b/go/mcap/version.go index 0f69efec2c..21e0c3e3ea 100644 --- a/go/mcap/version.go +++ b/go/mcap/version.go @@ -1,4 +1,4 @@ package mcap // Version of the MCAP library. -var Version = "v1.5.0" +var Version = "v1.6.0" From 8f35ee03b5921de869e2e9ac3475689f731cab1a Mon Sep 17 00:00:00 2001 From: Hans-Joachim Krauch Date: Fri, 20 Sep 2024 21:15:16 +0200 Subject: [PATCH 39/44] TypeScript: Reuse Reader for stream reader (#1236) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Changelog Typescript: Performance improvements for readers ### Docs None ### Description Reintroduces the changes made in #1212, which have been reverted in #1227, with an additional fix & test (separate commit) for the buffer append logic. The original code of #1212 removed the `streamBuffer` object and implemented the buffer append ing logic directly in the `McapStreamReader` class. The append logic had a bug which caused existing data to be partially overridden by new data. From #1212: > Reduces `McapStreamReader` heap usage by ~25% and boosts throughput by ~30%. > > This is both a refactor and perf boost, with more room for improvement, key changes: > - Removes StreamBuffer, hoisted into McapStreamReader > - Reuses Reader and DataView class across parse calls, only resetting them when necessary (e.g: append in McapStreamReader) > - Splits `parseRecord` into small scoped parsing functions, this itself is perf neutral (slightly positive) but facilitates future monomorphic fast paths > - Moves offsets tracking into Reader which is cleaner and faster > > ### Before > > ``` > McapStreamReader > 3.48±0.03 op/s Heap Used: 49.56±12.75 MB/op Heap Total: 41.47±11.83 MB/op ArrayBuffers: 112.95±6.87 MB/op > McapIndexedReader > 2.15±0.02 op/s Heap Used: 70.02±2.84 MB/op Heap Total: 58.34±3.36 MB/op ArrayBuffers: 17.86±0.76 MB/op > McapIndexedReader_reverse > 2.18±0.01 op/s Heap Used: 59.92±2.86 MB/op Heap Total: 39.81±1.00 MB/op ArrayBuffers: 14.58±1.42 MB/op > ``` > > ### After > > ``` > McapStreamReader > 4.47±0.08 op/s Heap Used: 42.35±2.23 MB/op Heap Total: 32.93±3.76 MB/op ArrayBuffers: 105.93±12.19 MB/op > McapIndexedReader > 2.38±0.02 op/s Heap Used: 72.00±1.70 MB/op Heap Total: 55.12±2.51 MB/op ArrayBuffers: 17.86±1.85 MB/op > McapIndexedReader_reverse > 2.38±0.02 op/s Heap Used: 63.41±1.55 MB/op Heap Total: 39.33±0.53 MB/op ArrayBuffers: 18.40±1.60 MB/op > ``` --------- Co-authored-by: Aaron O'Mullan --- typescript/core/src/ChunkCursor.ts | 25 +- typescript/core/src/McapIndexedReader.ts | 159 ++-- typescript/core/src/McapStreamReader.test.ts | 61 ++ typescript/core/src/McapStreamReader.ts | 121 ++-- typescript/core/src/McapWriter.test.ts | 12 +- typescript/core/src/Reader.ts | 38 +- typescript/core/src/StreamBuffer.test.ts | 47 -- typescript/core/src/StreamBuffer.ts | 58 -- typescript/core/src/parse.ts | 717 ++++++++++--------- 9 files changed, 638 insertions(+), 600 deletions(-) delete mode 100644 typescript/core/src/StreamBuffer.test.ts delete mode 100644 typescript/core/src/StreamBuffer.ts diff --git a/typescript/core/src/ChunkCursor.ts b/typescript/core/src/ChunkCursor.ts index 3a33303010..2113cc2b42 100644 --- a/typescript/core/src/ChunkCursor.ts +++ b/typescript/core/src/ChunkCursor.ts @@ -1,3 +1,4 @@ +import Reader from "./Reader"; import { parseRecord } from "./parse"; import { sortedIndexBy } from "./sortedIndexBy"; import { sortedLastIndexBy } from 
"./sortedLastIndex"; @@ -136,31 +137,25 @@ export class ChunkCursor { messageIndexes.byteLength, ); - let offset = 0; + const reader = new Reader(messageIndexesView); const arrayOfMessageOffsets: [logTime: bigint, offset: bigint][][] = []; - for ( - let result; - (result = parseRecord({ view: messageIndexesView, startOffset: offset, validateCrcs: true })), - result.record; - offset += result.usedBytes - ) { - if (result.record.type !== "MessageIndex") { + let record; + while ((record = parseRecord(reader, true))) { + if (record.type !== "MessageIndex") { continue; } if ( - result.record.records.length === 0 || - (this.#relevantChannels && !this.#relevantChannels.has(result.record.channelId)) + record.records.length === 0 || + (this.#relevantChannels && !this.#relevantChannels.has(record.channelId)) ) { continue; } - arrayOfMessageOffsets.push(result.record.records); + arrayOfMessageOffsets.push(record.records); } - if (offset !== messageIndexesView.byteLength) { - throw new Error( - `${messageIndexesView.byteLength - offset} bytes remaining in message index section`, - ); + if (reader.bytesRemaining() !== 0) { + throw new Error(`${reader.bytesRemaining()} bytes remaining in message index section`); } this.#orderedMessageOffsets = arrayOfMessageOffsets diff --git a/typescript/core/src/McapIndexedReader.ts b/typescript/core/src/McapIndexedReader.ts index 5955300a40..51ec00044e 100644 --- a/typescript/core/src/McapIndexedReader.ts +++ b/typescript/core/src/McapIndexedReader.ts @@ -2,6 +2,7 @@ import { crc32, crc32Final, crc32Init, crc32Update } from "@foxglove/crc"; import Heap from "heap-js"; import { ChunkCursor } from "./ChunkCursor"; +import Reader from "./Reader"; import { MCAP_MAGIC } from "./constants"; import { parseMagic, parseRecord } from "./parse"; import { DecompressHandlers, IReadable, TypedMcapRecords } from "./types"; @@ -111,7 +112,7 @@ export class McapIndexedReader { headerPrefix.byteOffset, headerPrefix.byteLength, ); - void parseMagic(headerPrefixView, 0); + void parseMagic(new Reader(headerPrefixView)); const headerContentLength = headerPrefixView.getBigUint64( MCAP_MAGIC.length + /* Opcode.HEADER */ 1, true, @@ -121,26 +122,19 @@ export class McapIndexedReader { const headerRecord = await readable.read(BigInt(MCAP_MAGIC.length), headerReadLength); headerEndOffset = BigInt(MCAP_MAGIC.length) + headerReadLength; - const headerResult = parseRecord({ - view: new DataView(headerRecord.buffer, headerRecord.byteOffset, headerRecord.byteLength), - startOffset: 0, - validateCrcs: true, - }); - if (headerResult.record?.type !== "Header") { + const headerReader = new Reader( + new DataView(headerRecord.buffer, headerRecord.byteOffset, headerRecord.byteLength), + ); + const headerResult = parseRecord(headerReader, true); + if (headerResult?.type !== "Header") { throw new Error( - `Unable to read header at beginning of file; found ${ - headerResult.record?.type ?? "nothing" - }`, + `Unable to read header at beginning of file; found ${headerResult?.type ?? 
"nothing"}`, ); } - if (headerResult.usedBytes !== headerRecord.byteLength) { - throw new Error( - `${ - headerRecord.byteLength - headerResult.usedBytes - } bytes remaining after parsing header`, - ); + if (headerReader.bytesRemaining() !== 0) { + throw new Error(`${headerReader.bytesRemaining()} bytes remaining after parsing header`); } - header = headerResult.record; + header = headerResult; } function errorWithLibrary(message: string): Error { @@ -179,33 +173,32 @@ export class McapIndexedReader { } try { - void parseMagic(footerAndMagicView, footerAndMagicView.byteLength - MCAP_MAGIC.length); + void parseMagic( + new Reader(footerAndMagicView, footerAndMagicView.byteLength - MCAP_MAGIC.length), + ); } catch (error) { throw errorWithLibrary((error as Error).message); } let footer: TypedMcapRecords["Footer"]; { - const footerResult = parseRecord({ - view: footerAndMagicView, - startOffset: 0, - validateCrcs: true, - }); - if (footerResult.record?.type !== "Footer") { + const footerReader = new Reader(footerAndMagicView); + const footerRecord = parseRecord(footerReader, true); + if (footerRecord?.type !== "Footer") { throw errorWithLibrary( `Unable to read footer from end of file (offset ${footerOffset}); found ${ - footerResult.record?.type ?? "nothing" + footerRecord?.type ?? "nothing" }`, ); } - if (footerResult.usedBytes !== footerAndMagicView.byteLength - MCAP_MAGIC.length) { + if (footerReader.bytesRemaining() !== MCAP_MAGIC.length) { throw errorWithLibrary( `${ - footerAndMagicView.byteLength - MCAP_MAGIC.length - footerResult.usedBytes + footerReader.bytesRemaining() - MCAP_MAGIC.length } bytes remaining after parsing footer`, ); } - footer = footerResult.record; + footer = footerRecord; } if (footer.summaryStart === 0n) { throw errorWithLibrary("File is not indexed"); @@ -261,6 +254,7 @@ export class McapIndexedReader { dataEndAndSummarySection.byteOffset, dataEndAndSummarySection.byteLength, ); + const indexReader = new Reader(indexView); const channelsById = new Map(); const schemasById = new Map(); @@ -271,46 +265,42 @@ export class McapIndexedReader { let statistics: TypedMcapRecords["Statistics"] | undefined; let dataSectionCrc: number | undefined; - let offset = 0; - for ( - let result; - (result = parseRecord({ view: indexView, startOffset: offset, validateCrcs: true })), - result.record; - offset += result.usedBytes - ) { - if (offset === 0 && result.record.type !== "DataEnd") { + let first = true; + let result; + while ((result = parseRecord(indexReader, true))) { + if (first && result.type !== "DataEnd") { throw errorWithLibrary( - `Expected DataEnd record to precede summary section, but found ${result.record.type}`, + `Expected DataEnd record to precede summary section, but found ${result.type}`, ); } - switch (result.record.type) { + first = false; + switch (result.type) { case "Schema": - schemasById.set(result.record.id, result.record); + schemasById.set(result.id, result); break; case "Channel": - channelsById.set(result.record.id, result.record); + channelsById.set(result.id, result); break; case "ChunkIndex": - chunkIndexes.push(result.record); + chunkIndexes.push(result); break; case "AttachmentIndex": - attachmentIndexes.push(result.record); + attachmentIndexes.push(result); break; case "MetadataIndex": - metadataIndexes.push(result.record); + metadataIndexes.push(result); break; case "Statistics": if (statistics) { throw errorWithLibrary("Duplicate Statistics record"); } - statistics = result.record; + statistics = result; break; case "SummaryOffset": - 
summaryOffsetsByOpcode.set(result.record.groupOpcode, result.record); + summaryOffsetsByOpcode.set(result.groupOpcode, result); break; case "DataEnd": - dataSectionCrc = - result.record.dataSectionCrc === 0 ? undefined : result.record.dataSectionCrc; + dataSectionCrc = result.dataSectionCrc === 0 ? undefined : result.dataSectionCrc; break; case "Header": case "Footer": @@ -319,13 +309,13 @@ export class McapIndexedReader { case "MessageIndex": case "Attachment": case "Metadata": - throw errorWithLibrary(`${result.record.type} record not allowed in index section`); + throw errorWithLibrary(`${result.type} record not allowed in index section`); case "Unknown": break; } } - if (offset !== indexView.byteLength) { - throw errorWithLibrary(`${indexView.byteLength - offset} bytes remaining in index section`); + if (indexReader.bytesRemaining() !== 0) { + throw errorWithLibrary(`${indexReader.bytesRemaining()} bytes remaining in index section`); } return new McapIndexedReader({ @@ -395,6 +385,7 @@ export class McapIndexedReader { // cursor becomes active (i.e. when we first need to access messages from the chunk) and removed // when the cursor is removed from the heap. const chunkViewCache = new Map(); + const chunkReader = new Reader(new DataView(new ArrayBuffer(0))); for (let cursor; (cursor = chunkCursors.peek()); ) { if (!cursor.hasMessageIndexes()) { // If we encounter a chunk whose message indexes have not been loaded yet, load them and re-organize the heap. @@ -421,27 +412,24 @@ export class McapIndexedReader { `Message offset beyond chunk bounds (log time ${logTime}, offset ${offset}, chunk data length ${chunkView.byteLength}) in chunk at offset ${cursor.chunkIndex.chunkStartOffset}`, ); } - const result = parseRecord({ - view: chunkView, - startOffset: Number(offset), - validateCrcs: validateCrcs ?? true, - }); - if (!result.record) { + chunkReader.reset(chunkView, Number(offset)); + const record = parseRecord(chunkReader, validateCrcs ?? true); + if (!record) { throw this.#errorWithLibrary( `Unable to parse record at offset ${offset} in chunk at offset ${cursor.chunkIndex.chunkStartOffset}`, ); } - if (result.record.type !== "Message") { + if (record.type !== "Message") { throw this.#errorWithLibrary( - `Unexpected record type ${result.record.type} in message index (time ${logTime}, offset ${offset} in chunk at offset ${cursor.chunkIndex.chunkStartOffset})`, + `Unexpected record type ${record.type} in message index (time ${logTime}, offset ${offset} in chunk at offset ${cursor.chunkIndex.chunkStartOffset})`, ); } - if (result.record.logTime !== logTime) { + if (record.logTime !== logTime) { throw this.#errorWithLibrary( - `Message log time ${result.record.logTime} did not match message index entry (${logTime} at offset ${offset} in chunk at offset ${cursor.chunkIndex.chunkStartOffset})`, + `Message log time ${record.logTime} did not match message index entry (${logTime} at offset ${offset} in chunk at offset ${cursor.chunkIndex.chunkStartOffset})`, ); } - yield result.record; + yield record; if (cursor.hasMoreMessages()) { // There is no need to reorganize the heap when chunks are ordered and not overlapping. 
@@ -468,19 +456,18 @@ export class McapIndexedReader { continue; } const metadataData = await this.#readable.read(metadataIndex.offset, metadataIndex.length); - const metadataResult = parseRecord({ - view: new DataView(metadataData.buffer, metadataData.byteOffset, metadataData.byteLength), - startOffset: 0, - validateCrcs: false, - }); - if (metadataResult.record?.type !== "Metadata") { + const metadataReader = new Reader( + new DataView(metadataData.buffer, metadataData.byteOffset, metadataData.byteLength), + ); + const metadataRecord = parseRecord(metadataReader, false); + if (metadataRecord?.type !== "Metadata") { throw this.#errorWithLibrary( `Metadata data at offset ${ metadataIndex.offset - } does not point to metadata record (found ${String(metadataResult.record?.type)})`, + } does not point to metadata record (found ${String(metadataRecord?.type)})`, ); } - yield metadataResult.record; + yield metadataRecord; } } @@ -519,23 +506,18 @@ export class McapIndexedReader { attachmentIndex.offset, attachmentIndex.length, ); - const attachmentResult = parseRecord({ - view: new DataView( - attachmentData.buffer, - attachmentData.byteOffset, - attachmentData.byteLength, - ), - startOffset: 0, - validateCrcs: validateCrcs ?? true, - }); - if (attachmentResult.record?.type !== "Attachment") { + const attachmentReader = new Reader( + new DataView(attachmentData.buffer, attachmentData.byteOffset, attachmentData.byteLength), + ); + const attachmentRecord = parseRecord(attachmentReader, validateCrcs ?? true); + if (attachmentRecord?.type !== "Attachment") { throw this.#errorWithLibrary( `Attachment data at offset ${ attachmentIndex.offset - } does not point to attachment record (found ${String(attachmentResult.record?.type)})`, + } does not point to attachment record (found ${String(attachmentRecord?.type)})`, ); } - yield attachmentResult.record; + yield attachmentRecord; } } @@ -547,20 +529,19 @@ export class McapIndexedReader { chunkIndex.chunkStartOffset, chunkIndex.chunkLength, ); - const chunkResult = parseRecord({ - view: new DataView(chunkData.buffer, chunkData.byteOffset, chunkData.byteLength), - startOffset: 0, - validateCrcs: options?.validateCrcs ?? true, - }); - if (chunkResult.record?.type !== "Chunk") { + const chunkReader = new Reader( + new DataView(chunkData.buffer, chunkData.byteOffset, chunkData.byteLength), + ); + const chunkRecord = parseRecord(chunkReader, options?.validateCrcs ?? 
true); + if (chunkRecord?.type !== "Chunk") { throw this.#errorWithLibrary( `Chunk start offset ${ chunkIndex.chunkStartOffset - } does not point to chunk record (found ${String(chunkResult.record?.type)})`, + } does not point to chunk record (found ${String(chunkRecord?.type)})`, ); } - const chunk = chunkResult.record; + const chunk = chunkRecord; let buffer = chunk.records; if (chunk.compression !== "" && buffer.byteLength > 0) { const decompress = this.#decompressHandlers?.[chunk.compression]; diff --git a/typescript/core/src/McapStreamReader.test.ts b/typescript/core/src/McapStreamReader.test.ts index 2d392979c3..4b5bdde719 100644 --- a/typescript/core/src/McapStreamReader.test.ts +++ b/typescript/core/src/McapStreamReader.test.ts @@ -1,5 +1,6 @@ import { crc32 } from "@foxglove/crc"; +import { McapRecordBuilder } from "./McapRecordBuilder"; import McapStreamReader from "./McapStreamReader"; import { MCAP_MAGIC, Opcode } from "./constants"; import { @@ -654,4 +655,64 @@ describe("McapStreamReader", () => { }); expect(reader.done()).toBe(true); }); + + it("correctly appends new data to internal buffer", () => { + const streamReader = new McapStreamReader({ includeChunks: true, noMagicPrefix: true }); + const recordBuilder = new McapRecordBuilder(); + + const channel = { + id: 0, + messageEncoding: "json", + schemaId: 0, + topic: "foo", + metadata: new Map(), + }; + const messageSize = 1_000; + const messageRecordBytes = 1 + 8 + 2 + 4 + 8 + 8 + messageSize; + + const makeMessage = (fillNumber: number) => ({ + channelId: 0, + data: new Uint8Array(messageSize).fill(fillNumber), + logTime: 0n, + publishTime: 0n, + sequence: 0, + }); + + const channelByteSize = recordBuilder.writeChannel(channel); + streamReader.append(recordBuilder.buffer); + expect(streamReader.bytesRemaining()).toBe(Number(channelByteSize)); + expect(streamReader.nextRecord()).toEqual({ ...channel, type: "Channel" }); + expect(streamReader.bytesRemaining()).toBe(0); + + // Add some messages and append them to the reader. + recordBuilder.reset(); + recordBuilder.writeMessage(makeMessage(1)); + recordBuilder.writeMessage(makeMessage(2)); + streamReader.append(recordBuilder.buffer); + expect(streamReader.bytesRemaining()).toBe(2 * messageRecordBytes); + + // Add one more message. Nothing has been consumed yet, but the internal buffer should be + // large enough to simply append the new data. + recordBuilder.reset(); + recordBuilder.writeMessage(makeMessage(3)); + streamReader.append(recordBuilder.buffer); + expect(streamReader.bytesRemaining()).toBe(3 * messageRecordBytes); + + // Read some (but not all) messages to forward the reader's internal offset + expect(streamReader.nextRecord()).toEqual({ ...makeMessage(1), type: "Message" }); + expect(streamReader.nextRecord()).toEqual({ ...makeMessage(2), type: "Message" }); + expect(streamReader.bytesRemaining()).toBe(1 * messageRecordBytes); + + // Add more messages. This will cause existing data to be shifted to the beginning of the buffer. 
+ recordBuilder.reset(); + recordBuilder.writeMessage(makeMessage(4)); + recordBuilder.writeMessage(makeMessage(5)); + streamReader.append(recordBuilder.buffer); + expect(streamReader.bytesRemaining()).toBe(3 * messageRecordBytes); + + expect(streamReader.nextRecord()).toEqual({ ...makeMessage(3), type: "Message" }); + expect(streamReader.nextRecord()).toEqual({ ...makeMessage(4), type: "Message" }); + expect(streamReader.nextRecord()).toEqual({ ...makeMessage(5), type: "Message" }); + expect(streamReader.bytesRemaining()).toBe(0); + }); }); diff --git a/typescript/core/src/McapStreamReader.ts b/typescript/core/src/McapStreamReader.ts index ddd412ef38..f04db47739 100644 --- a/typescript/core/src/McapStreamReader.ts +++ b/typescript/core/src/McapStreamReader.ts @@ -1,6 +1,6 @@ import { crc32 } from "@foxglove/crc"; -import StreamBuffer from "./StreamBuffer"; +import Reader from "./Reader"; import { MCAP_MAGIC } from "./constants"; import { parseMagic, parseRecord } from "./parse"; import { Channel, DecompressHandlers, McapMagic, TypedMcapRecord, TypedMcapRecords } from "./types"; @@ -50,7 +50,9 @@ type McapReaderOptions = { * ``` */ export default class McapStreamReader { - #buffer = new StreamBuffer(MCAP_MAGIC.length * 2); + #buffer = new ArrayBuffer(MCAP_MAGIC.length * 2); + #view = new DataView(this.#buffer, 0, 0); + #reader = new Reader(this.#view); #decompressHandlers; #includeChunks; #validateCrcs; @@ -78,7 +80,7 @@ export default class McapStreamReader { /** @returns The number of bytes that have been received by `append()` but not yet parsed. */ bytesRemaining(): number { - return this.#buffer.bytesRemaining(); + return this.#reader.bytesRemaining(); } /** @@ -89,7 +91,62 @@ export default class McapStreamReader { if (this.#doneReading) { throw new Error("Already done reading"); } - this.#buffer.append(data); + this.#appendOrShift(data); + } + + #appendOrShift(data: Uint8Array): void { + /** Add data to the buffer, shifting existing data or reallocating if necessary. */ + const consumedBytes = this.#reader.offset; + const unconsumedBytes = this.#view.byteLength - consumedBytes; + const neededCapacity = unconsumedBytes + data.byteLength; + + if (neededCapacity <= this.#buffer.byteLength) { + // Data fits in the current buffer + if ( + this.#view.byteOffset + this.#view.byteLength + data.byteLength <= + this.#buffer.byteLength + ) { + // Data fits by appending only + const array = new Uint8Array(this.#buffer, this.#view.byteOffset); + array.set(data, this.#view.byteLength); + this.#view = new DataView( + this.#buffer, + this.#view.byteOffset, + this.#view.byteLength + data.byteLength, + ); + // Reset the reader to use the new larger view. We keep the reader's previous offset as the + // view's byte offset didn't change, it only got larger. + this.#reader.reset(this.#view, this.#reader.offset); + } else { + // Data fits but requires moving existing data to start of buffer + const existingData = new Uint8Array( + this.#buffer, + this.#view.byteOffset + consumedBytes, + unconsumedBytes, + ); + const array = new Uint8Array(this.#buffer); + array.set(existingData, 0); + array.set(data, existingData.byteLength); + this.#view = new DataView(this.#buffer, 0, existingData.byteLength + data.byteLength); + this.#reader.reset(this.#view); + } + } else { + // New data doesn't fit, copy to a new buffer + + // Currently, the new buffer size may be smaller than the old size. For future optimizations, + // we could consider making the buffer size increase monotonically. 
+ this.#buffer = new ArrayBuffer(neededCapacity * 2); + const array = new Uint8Array(this.#buffer); + const existingData = new Uint8Array( + this.#view.buffer, + this.#view.byteOffset + consumedBytes, + unconsumedBytes, + ); + array.set(existingData, 0); + array.set(data, existingData.byteLength); + this.#view = new DataView(this.#buffer, 0, existingData.byteLength + data.byteLength); + this.#reader.reset(this.#view); + } } /** @@ -129,11 +186,10 @@ export default class McapStreamReader { *#read(): Generator { if (!this.#noMagicPrefix) { - let magic: McapMagic | undefined, usedBytes: number | undefined; - while ((({ magic, usedBytes } = parseMagic(this.#buffer.view, 0)), !magic)) { + let magic: McapMagic | undefined; + while (((magic = parseMagic(this.#reader)), !magic)) { yield; } - this.#buffer.consume(usedBytes); } let header: TypedMcapRecords["Header"] | undefined; @@ -144,20 +200,10 @@ export default class McapStreamReader { for (;;) { let record; - { - let usedBytes; - while ( - (({ record, usedBytes } = parseRecord({ - view: this.#buffer.view, - startOffset: 0, - validateCrcs: this.#validateCrcs, - })), - !record) - ) { - yield; - } - this.#buffer.consume(usedBytes); + while (((record = parseRecord(this.#reader, this.#validateCrcs)), !record)) { + yield; } + switch (record.type) { case "Unknown": break; @@ -206,18 +252,10 @@ export default class McapStreamReader { } } const view = new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength); - let chunkOffset = 0; - for ( - let chunkResult; - (chunkResult = parseRecord({ - view, - startOffset: chunkOffset, - validateCrcs: this.#validateCrcs, - })), - chunkResult.record; - chunkOffset += chunkResult.usedBytes - ) { - switch (chunkResult.record.type) { + const chunkReader = new Reader(view); + let chunkRecord; + while ((chunkRecord = parseRecord(chunkReader, this.#validateCrcs))) { + switch (chunkRecord.type) { case "Unknown": break; case "Header": @@ -232,34 +270,31 @@ export default class McapStreamReader { case "MetadataIndex": case "SummaryOffset": case "DataEnd": - throw errorWithLibrary( - `${chunkResult.record.type} record not allowed inside a chunk`, - ); + throw errorWithLibrary(`${chunkRecord.type} record not allowed inside a chunk`); case "Schema": case "Channel": case "Message": - yield chunkResult.record; + yield chunkRecord; break; } } - if (chunkOffset !== buffer.byteLength) { - throw errorWithLibrary(`${buffer.byteLength - chunkOffset} bytes remaining in chunk`); + if (chunkReader.bytesRemaining() !== 0) { + throw errorWithLibrary(`${chunkReader.bytesRemaining()} bytes remaining in chunk`); } break; } case "Footer": try { - let magic, usedBytes; - while ((({ magic, usedBytes } = parseMagic(this.#buffer.view, 0)), !magic)) { + let magic; + while (((magic = parseMagic(this.#reader)), !magic)) { yield; } - this.#buffer.consume(usedBytes); } catch (error) { throw errorWithLibrary((error as Error).message); } - if (this.#buffer.bytesRemaining() !== 0) { + if (this.#reader.bytesRemaining() !== 0) { throw errorWithLibrary( - `${this.#buffer.bytesRemaining()} bytes remaining after MCAP footer and trailing magic`, + `${this.#reader.bytesRemaining()} bytes remaining after MCAP footer and trailing magic`, ); } return record; diff --git a/typescript/core/src/McapWriter.test.ts b/typescript/core/src/McapWriter.test.ts index 57bdcd82fc..e16bc515d5 100644 --- a/typescript/core/src/McapWriter.test.ts +++ b/typescript/core/src/McapWriter.test.ts @@ -3,6 +3,7 @@ import { crc32 } from "@foxglove/crc"; import { McapIndexedReader } 
from "./McapIndexedReader"; import McapStreamReader from "./McapStreamReader"; import { McapWriter } from "./McapWriter"; +import Reader from "./Reader"; import { TempBuffer } from "./TempBuffer"; import { MCAP_MAGIC, Opcode } from "./constants"; import { parseMagic, parseRecord } from "./parse"; @@ -278,13 +279,12 @@ describe("McapWriter", () => { const array = tempBuffer.get(); const view = new DataView(array.buffer, array.byteOffset, array.byteLength); + const reader = new Reader(view); const records: TypedMcapRecord[] = []; - for ( - let offset = parseMagic(view, 0).usedBytes, result; - (result = parseRecord({ view, startOffset: offset, validateCrcs: true })), result.record; - offset += result.usedBytes - ) { - records.push(result.record); + parseMagic(reader); + let result; + while ((result = parseRecord(reader, true))) { + records.push(result); } const expectedChunkData = new Uint8Array([ diff --git a/typescript/core/src/Reader.ts b/typescript/core/src/Reader.ts index fcc2887237..d0136c648b 100644 --- a/typescript/core/src/Reader.ts +++ b/typescript/core/src/Reader.ts @@ -7,13 +7,27 @@ const textDecoder = new TextDecoder(); export default class Reader { #view: DataView; + #viewU8: Uint8Array; offset: number; constructor(view: DataView, offset = 0) { this.#view = view; + this.#viewU8 = new Uint8Array(view.buffer, view.byteOffset, view.byteLength); this.offset = offset; } + // Should be ~identical to the constructor, it allows us to reinitialize the reader when + // the view changes, without creating a new instance, avoiding allocation / GC overhead + reset(view: DataView, offset = 0): void { + this.#view = view; + this.#viewU8 = new Uint8Array(view.buffer, view.byteOffset, view.byteLength); + this.offset = offset; + } + + bytesRemaining(): number { + return this.#viewU8.length - this.offset; + } + uint8(): number { const value = this.#view.getUint8(this.offset); this.offset += 1; @@ -40,14 +54,12 @@ export default class Reader { string(): string { const length = this.uint32(); - if (this.offset + length > this.#view.byteLength) { + if (length === 0) { + return ""; + } else if (length > this.bytesRemaining()) { throw new Error(`String length ${length} exceeds bounds of buffer`); } - const value = textDecoder.decode( - new Uint8Array(this.#view.buffer, this.#view.byteOffset + this.offset, length), - ); - this.offset += length; - return value; + return textDecoder.decode(this.u8ArrayBorrow(length)); } keyValuePairs(readKey: (reader: Reader) => K, readValue: (reader: Reader) => V): [K, V][] { @@ -103,4 +115,18 @@ export default class Reader { } return result; } + + // Read a borrowed Uint8Array, useful temp references or borrow semantics + u8ArrayBorrow(length: number): Uint8Array { + const result = this.#viewU8.subarray(this.offset, this.offset + length); + this.offset += length; + return result; + } + + // Read a copied Uint8Array from the underlying buffer, use when you need to keep the data around + u8ArrayCopy(length: number): Uint8Array { + const result = this.#viewU8.slice(this.offset, this.offset + length); + this.offset += length; + return result; + } } diff --git a/typescript/core/src/StreamBuffer.test.ts b/typescript/core/src/StreamBuffer.test.ts deleted file mode 100644 index a45175b3e3..0000000000 --- a/typescript/core/src/StreamBuffer.test.ts +++ /dev/null @@ -1,47 +0,0 @@ -import StreamBuffer from "./StreamBuffer"; - -function toArray(view: DataView) { - return new Uint8Array(view.buffer, view.byteOffset, view.byteLength); -} - -describe("ByteStorage", () => { - it("handles 
basic append and consume", () => { - const buffer = new StreamBuffer(); - expect(buffer.bytesRemaining()).toBe(0); - - buffer.append(new Uint8Array([1, 2, 3])); - expect(buffer.bytesRemaining()).toBe(3); - expect(() => { - buffer.consume(4); - }).toThrow(); - - expect(toArray(buffer.view)).toEqual(new Uint8Array([1, 2, 3])); - buffer.consume(3); - expect(buffer.bytesRemaining()).toBe(0); - }); - - it("handles partial consume", () => { - const buffer = new StreamBuffer(); - - buffer.append(new Uint8Array([1, 2, 3, 4, 5])); - expect(buffer.bytesRemaining()).toBe(5); - buffer.consume(2); - expect(buffer.bytesRemaining()).toBe(3); - - expect(toArray(buffer.view)).toEqual(new Uint8Array([3, 4, 5])); - buffer.consume(3); - expect(buffer.bytesRemaining()).toBe(0); - }); - - it("reuses buffer within allocated capacity", () => { - const buffer = new StreamBuffer(5); - const rawBuffer = buffer.view.buffer; - buffer.append(new Uint8Array([1, 2])); - expect(buffer.view.buffer).toBe(rawBuffer); - buffer.append(new Uint8Array([3, 4, 5])); - expect(buffer.view.buffer).toBe(rawBuffer); - buffer.append(new Uint8Array([6, 7])); - expect(buffer.view.buffer).not.toBe(rawBuffer); - expect(toArray(buffer.view)).toEqual(new Uint8Array([1, 2, 3, 4, 5, 6, 7])); - }); -}); diff --git a/typescript/core/src/StreamBuffer.ts b/typescript/core/src/StreamBuffer.ts deleted file mode 100644 index 98eaa785d5..0000000000 --- a/typescript/core/src/StreamBuffer.ts +++ /dev/null @@ -1,58 +0,0 @@ -/** - * A growable buffer for use when processing a stream of data. - */ -export default class StreamBuffer { - #buffer: ArrayBuffer; - public view: DataView; - - constructor(initialCapacity = 0) { - this.#buffer = new ArrayBuffer(initialCapacity); - this.view = new DataView(this.#buffer, 0, 0); - } - - bytesRemaining(): number { - return this.view.byteLength; - } - - /** Mark some data as consumed, so the memory can be reused when new data is appended. */ - consume(count: number): void { - this.view = new DataView( - this.#buffer, - this.view.byteOffset + count, - this.view.byteLength - count, - ); - } - - /** Add data to the buffer, shifting existing data or reallocating if necessary. */ - append(data: Uint8Array): void { - if (this.view.byteOffset + this.view.byteLength + data.byteLength <= this.#buffer.byteLength) { - // Data fits by appending only - const array = new Uint8Array(this.view.buffer, this.view.byteOffset); - array.set(data, this.view.byteLength); - this.view = new DataView( - this.#buffer, - this.view.byteOffset, - this.view.byteLength + data.byteLength, - ); - } else if (this.view.byteLength + data.byteLength <= this.#buffer.byteLength) { - // Data fits in allocated buffer but requires moving existing data to start of buffer - const oldData = new Uint8Array(this.#buffer, this.view.byteOffset, this.view.byteLength); - const array = new Uint8Array(this.#buffer); - array.set(oldData, 0); - array.set(data, oldData.byteLength); - this.view = new DataView(this.#buffer, 0, this.view.byteLength + data.byteLength); - } else { - // New data doesn't fit, copy to a new buffer - - // Currently, the new buffer size may be smaller than the old size. For future optimizations, - // we could consider making the buffer size increase monotonically. 
- - const oldData = new Uint8Array(this.#buffer, this.view.byteOffset, this.view.byteLength); - this.#buffer = new ArrayBuffer((this.view.byteLength + data.byteLength) * 2); - const array = new Uint8Array(this.#buffer); - array.set(oldData, 0); - array.set(data, oldData.byteLength); - this.view = new DataView(this.#buffer, 0, this.view.byteLength + data.byteLength); - } - } -} diff --git a/typescript/core/src/parse.ts b/typescript/core/src/parse.ts index 7f2fe80285..95d0105750 100644 --- a/typescript/core/src/parse.ts +++ b/typescript/core/src/parse.ts @@ -1,374 +1,419 @@ import { crc32 } from "@foxglove/crc"; import Reader from "./Reader"; -import { isKnownOpcode, MCAP_MAGIC, Opcode } from "./constants"; +import { MCAP_MAGIC, Opcode } from "./constants"; import { McapMagic, TypedMcapRecord } from "./types"; /** * Parse a MCAP magic string at `startOffset` in `view`. */ -export function parseMagic( - view: DataView, - startOffset: number, -): { magic: McapMagic; usedBytes: number } | { magic?: undefined; usedBytes: 0 } { - if (startOffset + MCAP_MAGIC.length > view.byteLength) { - return { usedBytes: 0 }; +export function parseMagic(reader: Reader): McapMagic | undefined { + if (reader.bytesRemaining() < MCAP_MAGIC.length) { + return undefined; } - if (!MCAP_MAGIC.every((val, i) => val === view.getUint8(startOffset + i))) { + const magic = reader.u8ArrayBorrow(MCAP_MAGIC.length); + if (!MCAP_MAGIC.every((val, i) => val === magic[i])) { throw new Error( `Expected MCAP magic '${MCAP_MAGIC.map((val) => val.toString(16).padStart(2, "0")).join( " ", - )}', found '${Array.from(MCAP_MAGIC, (_, i) => - view - .getUint8(startOffset + i) - .toString(16) - .padStart(2, "0"), - ).join(" ")}'`, + )}', found '${Array.from(magic, (_, i) => magic[i]!.toString(16).padStart(2, "0")).join( + " ", + )}'`, ); } - return { - magic: { specVersion: "0" }, - usedBytes: MCAP_MAGIC.length, - }; + return { specVersion: "0" }; } /** - * Parse a MCAP record beginning at `startOffset` in `view`. 
+ * Parse a MCAP record from the given reader */ -export function parseRecord({ - view, - startOffset, - validateCrcs, -}: { - view: DataView; - startOffset: number; - validateCrcs: boolean; -}): { record: TypedMcapRecord; usedBytes: number } | { record?: undefined; usedBytes: 0 } { - if (startOffset + /*opcode*/ 1 + /*record content length*/ 8 >= view.byteLength) { - return { usedBytes: 0 }; +// NOTE: internal function in the hot path, (de)structuring args would be wasteful, acceptable perf/clarity tradeoff +// eslint-disable-next-line @foxglove/no-boolean-parameters +export function parseRecord(reader: Reader, validateCrcs = false): TypedMcapRecord | undefined { + const RECORD_HEADER_SIZE = 1 /*opcode*/ + 8; /*record content length*/ + if (reader.bytesRemaining() < RECORD_HEADER_SIZE) { + return undefined; } - const headerReader = new Reader(view, startOffset); + const start = reader.offset; + const opcode = reader.uint8(); + const recordLength = reader.uint64(); - const opcode = headerReader.uint8(); - - const recordLength = headerReader.uint64(); if (recordLength > Number.MAX_SAFE_INTEGER) { throw new Error(`Record content length ${recordLength} is too large`); } + const recordLengthNum = Number(recordLength); - const recordEndOffset = headerReader.offset + recordLengthNum; - if (recordEndOffset > view.byteLength) { - return { usedBytes: 0 }; + + if (reader.bytesRemaining() < recordLengthNum) { + reader.offset = start; // Rewind to the start of the record + return undefined; } - if (!isKnownOpcode(opcode)) { - const data = new Uint8Array( - view.buffer, - view.byteOffset + headerReader.offset, - recordLengthNum, - ); - const record: TypedMcapRecord = { - type: "Unknown", - opcode, - data, - }; - return { record, usedBytes: recordEndOffset - startOffset }; + let result: TypedMcapRecord; + switch (opcode as Opcode) { + case Opcode.HEADER: + result = parseHeader(reader, recordLengthNum); + break; + case Opcode.FOOTER: + result = parseFooter(reader, recordLengthNum); + break; + case Opcode.SCHEMA: + result = parseSchema(reader, recordLengthNum); + break; + case Opcode.CHANNEL: + result = parseChannel(reader, recordLengthNum); + break; + case Opcode.MESSAGE: + result = parseMessage(reader, recordLengthNum); + break; + case Opcode.CHUNK: + result = parseChunk(reader, recordLengthNum); + break; + case Opcode.MESSAGE_INDEX: + result = parseMessageIndex(reader, recordLengthNum); + break; + case Opcode.CHUNK_INDEX: + result = parseChunkIndex(reader, recordLengthNum); + break; + case Opcode.ATTACHMENT: + result = parseAttachment(reader, recordLengthNum, validateCrcs); + break; + case Opcode.ATTACHMENT_INDEX: + result = parseAttachmentIndex(reader, recordLengthNum); + break; + case Opcode.STATISTICS: + result = parseStatistics(reader, recordLengthNum); + break; + case Opcode.METADATA: + result = parseMetadata(reader, recordLengthNum); + break; + case Opcode.METADATA_INDEX: + result = parseMetadataIndex(reader, recordLengthNum); + break; + case Opcode.SUMMARY_OFFSET: + result = parseSummaryOffset(reader, recordLengthNum); + break; + case Opcode.DATA_END: + result = parseDataEnd(reader, recordLengthNum); + break; + default: + result = parseUnknown(reader, recordLengthNum, opcode); + break; } - const recordView = new DataView( - view.buffer, - view.byteOffset + headerReader.offset, - recordLengthNum, + // NOTE: a bit redundant, but ensures we've advanced by the full record length + // TODO: simplify this when we explore monomorphic paths + reader.offset = start + RECORD_HEADER_SIZE + recordLengthNum; 
+ + return result; +} + +function parseUnknown(reader: Reader, recordLength: number, opcode: number): TypedMcapRecord { + const data = reader.u8ArrayBorrow(recordLength); + return { + type: "Unknown", + opcode, + data, + }; +} + +function parseHeader(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const profile = reader.string(); + const library = reader.string(); + reader.offset = startOffset + recordLength; + return { type: "Header", profile, library }; +} + +function parseFooter(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const summaryStart = reader.uint64(); + const summaryOffsetStart = reader.uint64(); + const summaryCrc = reader.uint32(); + reader.offset = startOffset + recordLength; + return { + type: "Footer", + summaryStart, + summaryOffsetStart, + summaryCrc, + }; +} + +function parseSchema(reader: Reader, recordLength: number): TypedMcapRecord { + const start = reader.offset; + const id = reader.uint16(); + const name = reader.string(); + const encoding = reader.string(); + const dataLen = reader.uint32(); + const end = reader.offset; + if (recordLength - (end - start) < dataLen) { + throw new Error(`Schema data length ${dataLen} exceeds bounds of record`); + } + const data = reader.u8ArrayCopy(dataLen); + reader.offset = start + recordLength; + + return { + type: "Schema", + id, + encoding, + name, + data, + }; +} + +function parseChannel(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const channelId = reader.uint16(); + const schemaId = reader.uint16(); + const topicName = reader.string(); + const messageEncoding = reader.string(); + const metadata = reader.map( + (r) => r.string(), + (r) => r.string(), ); - const reader = new Reader(recordView); - - switch (opcode) { - case Opcode.HEADER: { - const profile = reader.string(); - const library = reader.string(); - const record: TypedMcapRecord = { type: "Header", profile, library }; - return { record, usedBytes: recordEndOffset - startOffset }; - } + reader.offset = startOffset + recordLength; - case Opcode.FOOTER: { - const summaryStart = reader.uint64(); - const summaryOffsetStart = reader.uint64(); - const summaryCrc = reader.uint32(); - const record: TypedMcapRecord = { - type: "Footer", - summaryStart, - summaryOffsetStart, - summaryCrc, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } + return { + type: "Channel", + id: channelId, + schemaId, + topic: topicName, + messageEncoding, + metadata, + }; +} - case Opcode.SCHEMA: { - const id = reader.uint16(); - const name = reader.string(); - const encoding = reader.string(); - const dataLen = reader.uint32(); - if (reader.offset + dataLen > recordView.byteLength) { - throw new Error(`Schema data length ${dataLen} exceeds bounds of record`); - } - const data = new Uint8Array( - recordView.buffer, - recordView.byteOffset + reader.offset, - dataLen, - ).slice(); - reader.offset += dataLen; - - const record: TypedMcapRecord = { - type: "Schema", - id, - encoding, - name, - data, - }; - - return { record, usedBytes: recordEndOffset - startOffset }; - } +function parseMessage(reader: Reader, recordLength: number): TypedMcapRecord { + const MESSAGE_PREFIX_SIZE = 2 + 4 + 8 + 8; // channelId, sequence, logTime, publishTime + const channelId = reader.uint16(); + const sequence = reader.uint32(); + const logTime = reader.uint64(); + const publishTime = reader.uint64(); + const data = reader.u8ArrayCopy(recordLength - 
MESSAGE_PREFIX_SIZE); + return { + type: "Message", + channelId, + sequence, + logTime, + publishTime, + data, + }; +} - case Opcode.CHANNEL: { - const channelId = reader.uint16(); - const schemaId = reader.uint16(); - const topicName = reader.string(); - const messageEncoding = reader.string(); - const metadata = reader.map( - (r) => r.string(), - (r) => r.string(), - ); - - const record: TypedMcapRecord = { - type: "Channel", - id: channelId, - schemaId, - topic: topicName, - messageEncoding, - metadata, - }; - - return { record, usedBytes: recordEndOffset - startOffset }; - } +function parseChunk(reader: Reader, recordLength: number): TypedMcapRecord { + const start = reader.offset; + const startTime = reader.uint64(); + const endTime = reader.uint64(); + const uncompressedSize = reader.uint64(); + const uncompressedCrc = reader.uint32(); + const compression = reader.string(); + const recordsByteLength = Number(reader.uint64()); + const end = reader.offset; + const prefixSize = end - start; + if (recordsByteLength + prefixSize > recordLength) { + throw new Error("Chunk records length exceeds remaining record size"); + } + const records = reader.u8ArrayCopy(recordsByteLength); + reader.offset = start + recordLength; + return { + type: "Chunk", + messageStartTime: startTime, + messageEndTime: endTime, + compression, + uncompressedSize, + uncompressedCrc, + records, + }; +} - case Opcode.MESSAGE: { - const channelId = reader.uint16(); - const sequence = reader.uint32(); - const logTime = reader.uint64(); - const publishTime = reader.uint64(); - const data = new Uint8Array( - recordView.buffer, - recordView.byteOffset + reader.offset, - recordView.byteLength - reader.offset, - ).slice(); - const record: TypedMcapRecord = { - type: "Message", - channelId, - sequence, - logTime, - publishTime, - data, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } +function parseMessageIndex(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const channelId = reader.uint16(); + const records = reader.keyValuePairs( + (r) => r.uint64(), + (r) => r.uint64(), + ); + reader.offset = startOffset + recordLength; + return { + type: "MessageIndex", + channelId, + records, + }; +} - case Opcode.CHUNK: { - const startTime = reader.uint64(); - const endTime = reader.uint64(); - const uncompressedSize = reader.uint64(); - const uncompressedCrc = reader.uint32(); - const compression = reader.string(); - const recordByteLength = Number(reader.uint64()); - if (recordByteLength + reader.offset > recordView.byteLength) { - throw new Error("Chunk records length exceeds remaining record size"); - } - const records = new Uint8Array( - recordView.buffer, - recordView.byteOffset + reader.offset, - recordByteLength, - ).slice(); - const record: TypedMcapRecord = { - type: "Chunk", - messageStartTime: startTime, - messageEndTime: endTime, - compression, - uncompressedSize, - uncompressedCrc, - records, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } +function parseChunkIndex(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const messageStartTime = reader.uint64(); + const messageEndTime = reader.uint64(); + const chunkStartOffset = reader.uint64(); + const chunkLength = reader.uint64(); + const messageIndexOffsets = reader.map( + (r) => r.uint16(), + (r) => r.uint64(), + ); + const messageIndexLength = reader.uint64(); + const compression = reader.string(); + const compressedSize = 
reader.uint64(); + const uncompressedSize = reader.uint64(); + reader.offset = startOffset + recordLength; + return { + type: "ChunkIndex", + messageStartTime, + messageEndTime, + chunkStartOffset, + chunkLength, + messageIndexOffsets, + messageIndexLength, + compression, + compressedSize, + uncompressedSize, + }; +} - case Opcode.MESSAGE_INDEX: { - const channelId = reader.uint16(); - const records = reader.keyValuePairs( - (r) => r.uint64(), - (r) => r.uint64(), - ); - const record: TypedMcapRecord = { - type: "MessageIndex", - channelId, - records, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.CHUNK_INDEX: { - const messageStartTime = reader.uint64(); - const messageEndTime = reader.uint64(); - const chunkStartOffset = reader.uint64(); - const chunkLength = reader.uint64(); - const messageIndexOffsets = reader.map( - (r) => r.uint16(), - (r) => r.uint64(), - ); - const messageIndexLength = reader.uint64(); - const compression = reader.string(); - const compressedSize = reader.uint64(); - const uncompressedSize = reader.uint64(); - const record: TypedMcapRecord = { - type: "ChunkIndex", - messageStartTime, - messageEndTime, - chunkStartOffset, - chunkLength, - messageIndexOffsets, - messageIndexLength, - compression, - compressedSize, - uncompressedSize, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.ATTACHMENT: { - const logTime = reader.uint64(); - const createTime = reader.uint64(); - const name = reader.string(); - const mediaType = reader.string(); - const dataLen = reader.uint64(); - if (BigInt(recordView.byteOffset + reader.offset) + dataLen > Number.MAX_SAFE_INTEGER) { - throw new Error(`Attachment too large: ${dataLen}`); - } - if (reader.offset + Number(dataLen) + 4 /*crc*/ > recordView.byteLength) { - throw new Error(`Attachment data length ${dataLen} exceeds bounds of record`); - } - const data = new Uint8Array( - recordView.buffer, - recordView.byteOffset + reader.offset, - Number(dataLen), - ).slice(); - reader.offset += Number(dataLen); - const crcLength = reader.offset; - const expectedCrc = reader.uint32(); - if (validateCrcs && expectedCrc !== 0) { - const actualCrc = crc32(new DataView(recordView.buffer, recordView.byteOffset, crcLength)); - if (actualCrc !== expectedCrc) { - throw new Error( - `Attachment CRC32 mismatch: expected ${expectedCrc}, actual ${actualCrc}`, - ); - } - } - - const record: TypedMcapRecord = { - type: "Attachment", - logTime, - createTime, - name, - mediaType, - data, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.ATTACHMENT_INDEX: { - const offset = reader.uint64(); - const length = reader.uint64(); - const logTime = reader.uint64(); - const createTime = reader.uint64(); - const dataSize = reader.uint64(); - const name = reader.string(); - const mediaType = reader.string(); - - const record: TypedMcapRecord = { - type: "AttachmentIndex", - offset, - length, - logTime, - createTime, - dataSize, - name, - mediaType, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.STATISTICS: { - const messageCount = reader.uint64(); - const schemaCount = reader.uint16(); - const channelCount = reader.uint32(); - const attachmentCount = reader.uint32(); - const metadataCount = reader.uint32(); - const chunkCount = reader.uint32(); - const messageStartTime = reader.uint64(); - const messageEndTime = reader.uint64(); - const channelMessageCounts = reader.map( - (r) => r.uint16(), - (r) => r.uint64(), - ); - - 
const record: TypedMcapRecord = { - type: "Statistics", - messageCount, - schemaCount, - channelCount, - attachmentCount, - metadataCount, - chunkCount, - messageStartTime, - messageEndTime, - channelMessageCounts, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.METADATA: { - const name = reader.string(); - const metadata = reader.map( - (r) => r.string(), - (r) => r.string(), - ); - const record: TypedMcapRecord = { type: "Metadata", metadata, name }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.METADATA_INDEX: { - const offset = reader.uint64(); - const length = reader.uint64(); - const name = reader.string(); - - const record: TypedMcapRecord = { - type: "MetadataIndex", - offset, - length, - name, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.SUMMARY_OFFSET: { - const groupOpcode = reader.uint8(); - const groupStart = reader.uint64(); - const groupLength = reader.uint64(); - - const record: TypedMcapRecord = { - type: "SummaryOffset", - groupOpcode, - groupStart, - groupLength, - }; - return { record, usedBytes: recordEndOffset - startOffset }; - } - case Opcode.DATA_END: { - const dataSectionCrc = reader.uint32(); - const record: TypedMcapRecord = { - type: "DataEnd", - dataSectionCrc, - }; - return { record, usedBytes: recordEndOffset - startOffset }; +function parseAttachment( + reader: Reader, + recordLength: number, + // NOTE: internal function in the hot path, (de)structuring args would be wasteful, acceptable perf/clarity tradeoff + // eslint-disable-next-line @foxglove/no-boolean-parameters + validateCrcs: boolean, +): TypedMcapRecord { + const startOffset = reader.offset; + const logTime = reader.uint64(); + const createTime = reader.uint64(); + const name = reader.string(); + const mediaType = reader.string(); + const dataLen = reader.uint64(); + // NOTE: probably not necessary, but just in case + if (BigInt(reader.offset) + dataLen > Number.MAX_SAFE_INTEGER) { + throw new Error(`Attachment too large: ${dataLen}`); + } + if (reader.offset + Number(dataLen) + 4 /*crc*/ > startOffset + recordLength) { + throw new Error(`Attachment data length ${dataLen} exceeds bounds of record`); + } + const data = reader.u8ArrayCopy(Number(dataLen)); + const crcLength = reader.offset - startOffset; + const expectedCrc = reader.uint32(); + if (validateCrcs && expectedCrc !== 0) { + reader.offset = startOffset; + const fullData = reader.u8ArrayBorrow(crcLength); + const actualCrc = crc32(fullData); + reader.offset = startOffset + crcLength + 4; + if (actualCrc !== expectedCrc) { + throw new Error(`Attachment CRC32 mismatch: expected ${expectedCrc}, actual ${actualCrc}`); } } + reader.offset = startOffset + recordLength; + + return { + type: "Attachment", + logTime, + createTime, + name, + mediaType, + data, + }; +} + +function parseAttachmentIndex(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const offset = reader.uint64(); + const length = reader.uint64(); + const logTime = reader.uint64(); + const createTime = reader.uint64(); + const dataSize = reader.uint64(); + const name = reader.string(); + const mediaType = reader.string(); + reader.offset = startOffset + recordLength; + + return { + type: "AttachmentIndex", + offset, + length, + logTime, + createTime, + dataSize, + name, + mediaType, + }; +} + +function parseStatistics(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const 
messageCount = reader.uint64(); + const schemaCount = reader.uint16(); + const channelCount = reader.uint32(); + const attachmentCount = reader.uint32(); + const metadataCount = reader.uint32(); + const chunkCount = reader.uint32(); + const messageStartTime = reader.uint64(); + const messageEndTime = reader.uint64(); + const channelMessageCounts = reader.map( + (r) => r.uint16(), + (r) => r.uint64(), + ); + reader.offset = startOffset + recordLength; + + return { + type: "Statistics", + messageCount, + schemaCount, + channelCount, + attachmentCount, + metadataCount, + chunkCount, + messageStartTime, + messageEndTime, + channelMessageCounts, + }; +} + +function parseMetadata(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const name = reader.string(); + const metadata = reader.map( + (r) => r.string(), + (r) => r.string(), + ); + reader.offset = startOffset + recordLength; + return { type: "Metadata", metadata, name }; +} + +function parseMetadataIndex(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const offset = reader.uint64(); + const length = reader.uint64(); + const name = reader.string(); + reader.offset = startOffset + recordLength; + + return { + type: "MetadataIndex", + offset, + length, + name, + }; +} + +function parseSummaryOffset(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const groupOpcode = reader.uint8(); + const groupStart = reader.uint64(); + const groupLength = reader.uint64(); + reader.offset = startOffset + recordLength; + + return { + type: "SummaryOffset", + groupOpcode, + groupStart, + groupLength, + }; +} + +function parseDataEnd(reader: Reader, recordLength: number): TypedMcapRecord { + const startOffset = reader.offset; + const dataSectionCrc = reader.uint32(); + reader.offset = startOffset + recordLength; + return { + type: "DataEnd", + dataSectionCrc, + }; } From bc670471f429fc4c9eb00f9630bad0431045f564 Mon Sep 17 00:00:00 2001 From: Hans-Joachim Krauch Date: Tue, 24 Sep 2024 12:26:20 +0200 Subject: [PATCH 40/44] Bump @mcap/core version (#1240) ### Changelog Performance improvements (#1236) --- typescript/core/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/typescript/core/package.json b/typescript/core/package.json index e02ee9523a..07103c5809 100644 --- a/typescript/core/package.json +++ b/typescript/core/package.json @@ -1,6 +1,6 @@ { "name": "@mcap/core", - "version": "2.1.4", + "version": "2.1.5", "description": "MCAP file support in TypeScript", "license": "MIT", "repository": { From 671548149de029b5fc921e0915da9992a35cadaa Mon Sep 17 00:00:00 2001 From: Michael Orlov Date: Tue, 24 Sep 2024 14:50:14 -0700 Subject: [PATCH 41/44] cpp: Fix for undefined behavior in mcap::ParseByteArray(..) (#1239) ### Changelog Fix for possible undefined behavior in the cpp mcap reader. ### Docs None ### Description - We discovered undefined behavior in the `mcap::ParseByteArray(..)` when was running `Rosbag2` tests with `UBSAN` (Undefined Behavior Sanitizer). In particular tests from the https://github.com/ros2/rosbag2/blob/rolling/rosbag2_cpp/test/rosbag2_cpp/test_local_message_definition_source.cpp UBSAN was pointing ou to the `std::memcpy(output->data(), data + 4, size);` - According to the https://en.cppreference.com/w/cpp/string/byte/memcpy If either dest or src is an invalid or null pointer, the behavior is undefined, even if count is zero. 
After further analysis, we found that `dest` is a null pointer and the count is zero in `std::memcpy(..)` when the message definition was not found and was therefore recorded as an empty string.
<table>
<tr><td>Before</td><td>After</td></tr>
<tr><td>Undefined behavior when reading empty message definitions</td><td>No undefined behavior when reading empty message definitions</td></tr>
</table>
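For illustration only, a minimal standalone sketch (not the library code) of the failure mode UBSAN reports and the kind of guard the fix applies: for an empty `std::vector`, `data()` may return a null pointer, and passing a null destination to `std::memcpy` is undefined behavior even when the count is zero.

```cpp
#include <cstddef>
#include <cstring>
#include <vector>

int main() {
  std::vector<std::byte> output;   // empty vector: output.data() may be nullptr
  const std::byte src[4] = {};

  // UBSAN reports this call: passing a null destination pointer to memcpy is
  // undefined behavior even though the byte count is zero.
  std::memcpy(output.data(), src, 0);

  // The guarded form avoids calling memcpy entirely when there is nothing to copy.
  const std::size_t size = 0;
  if (size > 0) {
    std::memcpy(output.data(), src, size);
  }
}
```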
--------- Signed-off-by: Michael Orlov --- cpp/mcap/include/mcap/internal.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/mcap/include/mcap/internal.hpp b/cpp/mcap/include/mcap/internal.hpp index 4faedd0b27..69b1dd9d43 100644 --- a/cpp/mcap/include/mcap/internal.hpp +++ b/cpp/mcap/include/mcap/internal.hpp @@ -138,7 +138,11 @@ inline Status ParseByteArray(const std::byte* data, uint64_t maxSize, ByteArray* return Status(StatusCode::InvalidRecord, msg); } output->resize(size); - std::memcpy(output->data(), data + 4, size); + // output->data() may return nullptr if 'output' is empty, but memcpy() does not accept nullptr. + // 'output' will be empty only if the 'size' is equal to 0. + if (size > 0) { + std::memcpy(output->data(), data + 4, size); + } return StatusCode::Success; } From ef36f4b795a4d3eeff1bb225feaaabe972022c7a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 25 Sep 2024 08:13:10 -0700 Subject: [PATCH 42/44] build(deps): bump actions/checkout from 3 to 4 (#963) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4.
Release notes, changelog (sourced from actions/checkout's releases and changelog, covering v3.0.1 through v4.0.0), and commit list: https://github.com/actions/checkout/compare/v3...v4.0.0
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=actions/checkout&package-manager=github_actions&previous-version=3&new-version=4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

You can trigger a rebase of this PR by commenting `@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
> **Note** > Automatic rebases have been disabled on this pull request as it has been open for over 30 days. Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci.yml | 36 +++++++++++++++++------------------ .github/workflows/website.yml | 10 +++++----- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e4c9ae282c..bb6d09b145 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,7 +11,7 @@ jobs: spellcheck: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - run: corepack enable @@ -26,7 +26,7 @@ jobs: conformance-lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - run: corepack enable @@ -41,7 +41,7 @@ jobs: conformance-cpp: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - run: corepack enable @@ -61,7 +61,7 @@ jobs: conformance-go: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - run: corepack enable @@ -80,7 +80,7 @@ jobs: conformance-python: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - run: corepack enable @@ -99,7 +99,7 @@ jobs: conformance-typescript: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - run: corepack enable @@ -114,7 +114,7 @@ jobs: conformance-kaitai-struct: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - run: corepack enable @@ -129,7 +129,7 @@ jobs: conformance-swift: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - run: corepack enable @@ -148,7 +148,7 @@ jobs: conformance-rust: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - run: corepack enable @@ -172,7 +172,7 @@ jobs: run: working-directory: cpp steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - uses: actions/cache@v4 @@ -191,7 +191,7 @@ jobs: run: working-directory: cpp steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - uses: actions/cache@v4 @@ -213,7 +213,7 @@ jobs: id-token: write steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - run: corepack enable @@ -270,7 +270,7 @@ jobs: typescript-examples: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - run: corepack enable @@ -300,7 +300,7 @@ jobs: permissions: id-token: write steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - uses: actions/setup-python@v5 @@ -381,7 +381,7 @@ jobs: run: working-directory: go steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - uses: actions/setup-go@v5 @@ -439,7 +439,7 @@ jobs: env: ${{ matrix.env }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: "0" - run: git fetch --depth=1 origin +refs/tags/*:refs/tags/* @@ -466,7 +466,7 @@ jobs: swift: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - uses: swift-actions/setup-swift@v2 @@ -483,7 +483,7 @@ jobs: run: working-directory: rust steps: - - uses: 
actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - uses: actions-rs/toolchain@v1 diff --git a/.github/workflows/website.yml b/.github/workflows/website.yml index b8852a1e1e..97e4043abb 100644 --- a/.github/workflows/website.yml +++ b/.github/workflows/website.yml @@ -10,7 +10,7 @@ jobs: docs-home: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true @@ -37,7 +37,7 @@ jobs: docs-cpp: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - run: make -C cpp ci-docs @@ -52,7 +52,7 @@ jobs: docs-python: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: @@ -73,7 +73,7 @@ jobs: docs-swift: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: swift-actions/setup-swift@v2 with: @@ -112,7 +112,7 @@ jobs: deployments: write steps: # need checkout so that cloudflare can detect git commit - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/download-artifact@v4 From c0bc479d06751878d4763b8ce796cba94d973c72 Mon Sep 17 00:00:00 2001 From: kyle-basis Date: Wed, 25 Sep 2024 08:26:15 -0700 Subject: [PATCH 43/44] Don't clear channels on writer close() (#1194) Don't clear channels on writer close(). This matches the behavior schemas has, allowing for stable ids after closing and opening a new file with the same reader. See https://foxglove.slack.com/archives/C02H1JXG3C3/p1720827358364829?thread_ts=1720825408.044619&cid=C02H1JXG3C3 Alternatives to this involve more bookkeeping on the recorder implementation side to re-register all schemas/channels on split. --- cpp/mcap/include/mcap/writer.inl | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/mcap/include/mcap/writer.inl b/cpp/mcap/include/mcap/writer.inl index a99333aa2f..08465500a8 100644 --- a/cpp/mcap/include/mcap/writer.inl +++ b/cpp/mcap/include/mcap/writer.inl @@ -473,7 +473,6 @@ void McapWriter::terminate() { zstdChunk_.reset(); #endif - channels_.clear(); attachmentIndex_.clear(); metadataIndex_.clear(); chunkIndex_.clear(); From 0ebaf69efa2dd5dbe33a882ce51caa8451d518d2 Mon Sep 17 00:00:00 2001 From: Jacob Bandes-Storch Date: Wed, 25 Sep 2024 09:37:39 -0700 Subject: [PATCH 44/44] cpp: bump version to 1.4.1 (#1241) Bump version to 1.4.1. 
Changes include: - #1239 - #1199 --- cpp/bench/conanfile.py | 2 +- cpp/build-docs.sh | 2 +- cpp/build.sh | 2 +- cpp/docs/conanfile.py | 2 +- cpp/examples/conanfile.py | 2 +- cpp/mcap/conanfile.py | 2 +- cpp/mcap/include/mcap/types.hpp | 2 +- cpp/test/conanfile.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/bench/conanfile.py b/cpp/bench/conanfile.py index 57c7862ee2..ffc1efff3f 100644 --- a/cpp/bench/conanfile.py +++ b/cpp/bench/conanfile.py @@ -4,7 +4,7 @@ class McapBenchmarksConan(ConanFile): settings = "os", "compiler", "build_type", "arch" generators = "cmake" - requires = "benchmark/1.7.0", "mcap/1.4.0" + requires = "benchmark/1.7.0", "mcap/1.4.1" def build(self): cmake = CMake(self) diff --git a/cpp/build-docs.sh b/cpp/build-docs.sh index b93d09141b..f2e94fc348 100755 --- a/cpp/build-docs.sh +++ b/cpp/build-docs.sh @@ -4,7 +4,7 @@ set -e conan config init -conan editable add ./mcap mcap/1.4.0 +conan editable add ./mcap mcap/1.4.1 conan install docs --install-folder docs/build/Release \ -s compiler.cppstd=17 -s build_type=Release --build missing diff --git a/cpp/build.sh b/cpp/build.sh index 77ec913e08..9831bd1e1b 100755 --- a/cpp/build.sh +++ b/cpp/build.sh @@ -4,7 +4,7 @@ set -e conan config init -conan editable add ./mcap mcap/1.4.0 +conan editable add ./mcap mcap/1.4.1 conan install test --install-folder test/build/Debug \ -s compiler.cppstd=17 -s build_type=Debug --build missing diff --git a/cpp/docs/conanfile.py b/cpp/docs/conanfile.py index 8545b8e193..d1645bcff8 100644 --- a/cpp/docs/conanfile.py +++ b/cpp/docs/conanfile.py @@ -4,7 +4,7 @@ class McapDocsConan(ConanFile): settings = "os", "compiler", "build_type", "arch" generators = "cmake" - requires = "mcap/1.4.0" + requires = "mcap/1.4.1" def build(self): cmake = CMake(self) diff --git a/cpp/examples/conanfile.py b/cpp/examples/conanfile.py index 0e8845d226..b3f682916d 100644 --- a/cpp/examples/conanfile.py +++ b/cpp/examples/conanfile.py @@ -5,7 +5,7 @@ class McapExamplesConan(ConanFile): settings = "os", "compiler", "build_type", "arch" generators = "cmake" requires = [ - "mcap/1.4.0", + "mcap/1.4.1", "protobuf/3.21.1", "nlohmann_json/3.10.5", "catch2/2.13.8", diff --git a/cpp/mcap/conanfile.py b/cpp/mcap/conanfile.py index dc9bc27158..0c9d7b8278 100644 --- a/cpp/mcap/conanfile.py +++ b/cpp/mcap/conanfile.py @@ -3,7 +3,7 @@ class McapConan(ConanFile): name = "mcap" - version = "1.4.0" + version = "1.4.1" url = "https://github.com/foxglove/mcap" homepage = "https://github.com/foxglove/mcap" description = "A C++ implementation of the MCAP file format" diff --git a/cpp/mcap/include/mcap/types.hpp b/cpp/mcap/include/mcap/types.hpp index 555a020ace..16059dff6d 100644 --- a/cpp/mcap/include/mcap/types.hpp +++ b/cpp/mcap/include/mcap/types.hpp @@ -13,7 +13,7 @@ namespace mcap { -#define MCAP_LIBRARY_VERSION "1.4.0" +#define MCAP_LIBRARY_VERSION "1.4.1" using SchemaId = uint16_t; using ChannelId = uint16_t; diff --git a/cpp/test/conanfile.py b/cpp/test/conanfile.py index 7cddbc2707..683014746a 100644 --- a/cpp/test/conanfile.py +++ b/cpp/test/conanfile.py @@ -4,7 +4,7 @@ class McapTestConan(ConanFile): settings = "os", "compiler", "build_type", "arch" generators = "cmake" - requires = "catch2/2.13.8", "mcap/1.4.0", "nlohmann_json/3.10.5" + requires = "catch2/2.13.8", "mcap/1.4.1", "nlohmann_json/3.10.5" def build(self): cmake = CMake(self)