make StreamingWARCSerializer the default WARCSerializer!
rename StreamingWARCSerializer -> WARCSerializer
rename old WARCSerializer -> FullRecordWARCSerializer
deprecate FullRecordWARCSerializer
update tests to also test FullRecordWARCSerializer
rename bufferRecord -> digestRecord
add WARCSerializerTempBuffer to tempfilebuffer.ts, include in node builds
README: mention new streaming support in WARCSerializer, add node examples
ikreymer committed Aug 3, 2023
1 parent 272f3ac commit 48923dd
Showing 16 changed files with 219 additions and 101 deletions.
99 changes: 95 additions & 4 deletions README.md
@@ -308,13 +308,16 @@ For example, the following snippet demonstrates a writer that logs all HTML file

## Writing WARC Files

WARC records can be created using the `WARCRecord.create()` static method and serialized using `WARCSerializer`.

When serializing, the `WARC-Payload-Digest`, `WARC-Block-Digest`, and `Content-Length` headers are automatically computed
to ensure correct values, overriding any provided in `warcHeaders`.

Setting `gzip: true` in `opts` will serialize to GZIP-compressed records.

Calling `WARCSerializer.serialize(record, opts)` serializes the entire WARC record into a single array buffer.

This is the simplest way to serialize WARC records and works well for smaller records.
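
For instance, a minimal sketch of fetching a URL and serializing the response to a single buffer (the URL here is illustrative):

```javascript
import { WARCRecord, WARCSerializer } from "warcio";

async function main() {
  const url = "https://example.com/";

  const resp = await fetch(url);

  const record = await WARCRecord.create({type: "response", url}, resp.body);

  // digests and Content-Length are computed automatically;
  // gzip: true produces a GZIP-compressed record
  const buffer = await WARCSerializer.serialize(record, {gzip: true});

  console.log(`serialized ${buffer.length} bytes`);
}

main();
```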

<details>
<summary>An example of generating WARCs in the browser:</summary>
@@ -425,6 +428,94 @@ main();

</details>

## Writing Larger WARC Records

For larger WARC records, buffering the entire payload in memory is not ideal.

Starting with 2.2.0, warcio.js supports fully streaming serialization.
To compute the digests, the data must be read twice: once to compute the digest and again to be written to the WARC.
To support this, warcio.js uses `hash-wasm` for incremental digest computation and supports an external buffer
to which the data can be written and later read back.

For the Node version, a temporary file-based serializer is provided via `WARCSerializerTempBuffer`.

For browser-based usage, the data is still buffered in memory, but custom buffering solutions can be implemented.
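
As a rough illustration of the two-pass approach, here is a simplified sketch (not warcio's internals: it uses `node:crypto` in place of `hash-wasm` and a plain array in place of the external buffer):

```javascript
import { createHash } from "node:crypto";

// Pass 1: hash each chunk incrementally while spooling it for later replay.
async function digestAndSpool(iter) {
  const hasher = createHash("sha256");
  const spool = [];
  for await (const chunk of iter) {
    hasher.update(chunk); // digest updated chunk by chunk, never fully buffered in the hasher
    spool.push(chunk);    // spooled so the payload can be read a second time
  }
  return {digest: "sha-256:" + hasher.digest("hex"), spool};
}

// Pass 2: with the digest now known, headers can be emitted
// before replaying the spooled payload.
async function* serializeWithDigest(iter) {
  const {digest, spool} = await digestAndSpool(iter);
  yield new TextEncoder().encode(`WARC-Payload-Digest: ${digest}\r\n\r\n`);
  yield* spool;
}
```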

<details>
<summary>Example of generating larger WARC records in Node using WARCSerializerTempBuffer</summary>

```javascript
import { WARCRecord } from "warcio";
import { WARCSerializerTempBuffer } from "warcio/tempfilebuffer";

async function main() {
  const url = "https://example.com/some/large/file";

  const resp = await fetch(url);

  const record = await WARCRecord.create({type: "response", url}, resp.body);

  // store up to 16K in memory, buffer the rest to temporary file
  const serializer = new WARCSerializerTempBuffer(record, {gzip: true, maxMemSize: 16384});

  for await (const chunk of serializer) {
    // process WARC record chunks incrementally
    console.log(chunk);
  }
}

main();
```

</details>

Using standard Node `fs` functions, content retrieved via `fetch()` can easily be streamed directly into
WARC records on disk:

<details>
<summary>Fetching and streaming content to multiple WARC records on disk using WARCSerializerTempBuffer</summary>

```javascript
import { WARCRecord } from "warcio";
import { WARCSerializerTempBuffer } from "warcio/tempfilebuffer";

import fs from "node:fs";
import { pipeline } from "node:stream/promises";
import { Readable } from "node:stream";

async function fetchAndWrite(url, warcOutputStream) {
  const resp = await fetch(url);

  const record = await WARCRecord.create({type: "response", url}, resp.body);

  const serializer = new WARCSerializerTempBuffer(record, {gzip: true});

  await pipeline(Readable.from(serializer), warcOutputStream, {end: false});
}

async function main() {
  const outputFile = fs.createWriteStream("test.warc.gz");

  await fetchAndWrite("https://example.com/some/large/file1.bin", outputFile);

  await fetchAndWrite("https://example.com/another/large/file2", outputFile);

  outputFile.close();
}

main();
```

</details>
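
In the example above, passing `{end: false}` to `pipeline()` keeps the shared output stream open after each record is written, which is what allows multiple WARC records to be appended to the same file before it is explicitly closed.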

## Not Yet Implemented

This library is still new and some functionality is 'not yet implemented' when compared to python `warcio` including:
18 changes: 9 additions & 9 deletions dist/cli.cjs

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions dist/cli.js

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions dist/index.all.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dist/index.all.js.map

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions dist/index.cjs

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions dist/index.d.ts
@@ -232,25 +232,25 @@ declare abstract class BaseWARCSerializer extends BaseAsyncIterReader {
  [Symbol.asyncIterator](): AsyncGenerator<any, void, unknown>;
  readlineRaw(maxLength?: number): Promise<Uint8Array | null>;
  pakoCompress(): AsyncGenerator<any, void, unknown>;
  streamCompress(cs: CompressionStream): AsyncGenerator<Uint8Array, void, unknown>;
}
declare class FullRecordWARCSerializer extends BaseWARCSerializer {
  static serialize(record: WARCRecord, opts?: WARCSerializerOpts): Promise<Uint8Array>;
  static base16(hashBuffer: ArrayBuffer): string;
  digestMessage(chunk: BufferSource): Promise<string>;
  generateRecord(): AsyncGenerator<Uint8Array>;
}
declare class WARCSerializer extends BaseWARCSerializer {
  externalBuffer: StreamingBufferIO;
  buffered: boolean;
  blockHasher: IHasher | null;
  payloadHasher: IHasher | null;
  httpHeadersBuff: Uint8Array | null;
  warcHeadersBuff: Uint8Array | null;
  static serialize(record: WARCRecord, opts?: WARCSerializerOpts, externalBuffer?: StreamingBufferIO): Promise<Uint8Array>;
  constructor(record: WARCRecord, opts?: WARCSerializerOpts, externalBuffer?: StreamingBufferIO);
  newHasher(): Promise<IHasher> | null;
  getDigest(hasher: IHasher): string;
  digestRecord(): Promise<void>;
  generateRecord(): AsyncGenerator<Uint8Array, void, unknown>;
}

@@ -309,4 +309,4 @@ declare function mfdToQueryString(mfd: string | Uint8Array, contentType: string)
declare function concatChunks(chunks: Uint8Array[], size: number): Uint8Array;
declare function splitChunk(chunk: Uint8Array, inx: number): [Uint8Array, Uint8Array];

export { AsyncIterReader, AsyncIterReaderOpts, BaseAsyncIterReader, CDXAndRecordIndexer, CDXIndexer, FullRecordWARCSerializer, Indexer, LimitReader, NoConcatInflator, Request, Source, SourceReadable, SourceReader, StatusAndHeaders, StatusAndHeadersParser, StreamResult, StreamResults, StreamingBufferIO, WARCParser, WARCParserOpts, WARCRecord, WARCRecordOpts, WARCSerializer, WARCSerializerOpts, WARCType, WARC_1_0, WARC_1_1, appendRequestQuery, concatChunks, getSurt, jsonToQueryParams, jsonToQueryString, mfdToQueryParams, mfdToQueryString, postToGetUrl, splitChunk };
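
Since both serializer classes expose the same static `serialize()` entry point, migrating between them is just a rename. A brief sketch (the `record` argument is assumed to be a previously created `WARCRecord`):

```javascript
import { WARCSerializer, FullRecordWARCSerializer } from "warcio";

async function serializeBoth(record) {
  // new default: streaming serializer (formerly StreamingWARCSerializer)
  const streamed = await WARCSerializer.serialize(record, {gzip: true});

  // deprecated: fully-buffered serializer (formerly WARCSerializer)
  const buffered = await FullRecordWARCSerializer.serialize(record, {gzip: true});

  return {streamed, buffered};
}
```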
