Skip to content

Commit

Permalink
Add support for more compression algorithms.
Browse files Browse the repository at this point in the history
Now can read/write gz, bz2, zst, xz & lz4
  • Loading branch information
kawas44 authored and edporras committed Jul 1, 2024
1 parent 3406b50 commit a45ff80
Show file tree
Hide file tree
Showing 14 changed files with 471 additions and 63 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/clojure.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
- name: Install dependencies
run: lein -U deps
- name: Run tests
run: lein test
run: lein test-all
- name: Deploy
if: github.ref == 'refs/heads/master' || github.ref == 'refs/heads/devel'
run: lein deploy
Expand Down
25 changes: 13 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ Oscaro’s generic I/O tools collection.
- [spit](#spit)
- [exists?](#exists)
- [core](#core)
- [gzipped?](#gzipped)
- [file-reader](#file-reader)
- [About compression](#about-compression)
- [clj-kondo](#clj-kondo)
- [License](#license)

Expand Down Expand Up @@ -290,20 +290,21 @@ you need to call (close! file) when you done.

**returns**: an map with a `:stream` key

#### `gzipped?`
## About compression

Test if a filename ends with `.gz` or `.gzip`
By default, `tools.io` supports _[gzip]_, _[bzip2]_ and _[framed lz4][]_
compression algorithms and can be extended by implementing a custom protocol
(see sources).

**arguments**:
- filename

**returns**: a boolean
It also supports the following formats if you provide the required dependencies.
- _[xz]_ with `org.tukaani/xz` provided
- _[zstd]_ with `com.github.luben/zstd-jni` provided

examples
```clojure
(core/gzipped? "toto.gz"); => true
(core/gzipped? "toto.GZip"); => true
```
[gzip]: https://en.wikipedia.org/wiki/Gzip
[bzip2]: https://en.wikipedia.org/wiki/Bzip2
[xz]: https://en.wikipedia.org/wiki/XZ_Utils
[zstd]: https://en.wikipedia.org/wiki/Zstd
[framed lz4]: https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)

## clj-kondo

Expand Down
10 changes: 9 additions & 1 deletion project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,13 @@
:profiles {:dev {:global-vars {*warn-on-reflection* true}
:source-paths ["dev"]
:resource-paths ["test/resources"]
:dependencies [[org.clojure/tools.namespace "1.5.0"]]}}
:dependencies [[org.clojure/tools.namespace "1.5.0"]]}
:extra-compression
{:dependencies [[org.tukaani/xz "1.9"]
[com.github.luben/zstd-jni "1.5.6-3"]]}}
:test-selectors {:default (fn [m] (not (:extra-compression m)))
:extra-compression :extra-compression}
:aliases {"repl" ["with-profile" "+extra-compression" "repl"]
"test-all" ["with-profile" "+extra-compression" "test" ":all"]}
:target-path "target/%s/"
:repl-options {:init-ns user})
38 changes: 19 additions & 19 deletions src/tools/io.clj
Original file line number Diff line number Diff line change
Expand Up @@ -229,54 +229,54 @@

(def ^{:added "0.3.16"
:doc
"Return a lazy seq of string from a [protocol://]text[.gz] file.
warning: the seq must be entirely consumed before the file is closed."}
"Returns a lazy seq of string from a [protocol://]text[.zext] file.
Warning: The seq must be entirely consumed before the file is closed."}
read-text-file
(read-string-format-file-fn identity))

(def ^{:added "0.3.16"
:doc
"Write a seq of strings in a [protocol://]text[.gz] file."}
"Writes a seq of strings in a [protocol://]text[.zext] file."}
write-text-file
(write-string-file-fn identity))

(def ^{:added "0.3.16"
:doc
"Return a lazy seq of parsed json objects from a [protocol://]jsons[.gz] file.
warning: the seq must be entirely consumed before the file is closed."}
"Returns a lazy seq of parsed json objects from a [protocol://]jsons[.zext] file.
Warning: The seq must be entirely consumed before the file is closed."}
read-jsons-file
(read-string-format-file-fn #(charred/read-json % :key-fn keyword)))

(def ^{:added "0.3.16"
:doc
"Write a seq of elements serialized as JSON in a [protocol://]jsons[.gz] file."}
"Writes a seq of elements serialized as JSON in a [protocol://]jsons[.zext] file."}
write-jsons-file
(write-string-file-fn #(charred/write-json-str % :indent-str nil :escape-slash false)))

(def ^{:added "0.3.16"
:doc
"Return a lazy seq of parsed edn objects from a [protocol://]edn[.gz] file.
warning: the seq must be entirely consumed before the file is closed."}
"Returns a lazy seq of parsed edn objects from a [protocol://]edn[.zext] file.
Warning: The seq must be entirely consumed before the file is closed."}
read-edns-file
(read-string-format-file-fn edn/read-string))

(def ^{:added "0.3.16"
:doc
"Write a seq of elements serialized as EDN in a [protocol://]edn[.gz] file."}
"Writes a seq of elements serialized as EDN in a [protocol://]edn[.zext] file."}
write-edns-file
(write-string-file-fn prn-str))

(defn write-edn-file
"Write an element serialized as EDN in a [protocol://]edn[.gz] file.
"Writes an element serialized as EDN in a [protocol://]edn[.zext] file.
This is equivalent to call write-edns-file on a one-element sequence."
([filename x] (write-edn-file filename {} x))
([filename options x] (write-edns-file filename options [x])))

(defn ^{:added "0.3.16"
:doc
"Return a lazy seq of parsed csv row as vector from a [protocol://]csv[.gz] file.
see http://clojure.github.io/data.csv/ for options
warning: the seq must be entirely consumed before the file is closed.
"Returns a lazy seq of parsed csv row as vector from a [protocol://]csv[.zext] file.
See http://clojure.github.io/data.csv/ for options
Warning: The seq must be entirely consumed before the file is closed.
sample usage:
(read-csv-file \"infos_tarifs.csv\" {:encoding \"ISO-8859-1\"} :separator \\;)"}
Expand All @@ -297,8 +297,8 @@

(defn ^{:added "0.3.16"
:doc
"Write a seq of vectors serialized as CSV in a [protocol://]csv[.gz] file.
see http://clojure.github.io/data.csv/ for options.
"Writes a seq of vectors serialized as CSV in a [protocol://]csv[.zext] file.
See http://clojure.github.io/data.csv/ for options.
(write-csv-file out my-lines)
(write-csv-file out [stream-options-map] my-lines [csv options...])
Expand All @@ -325,15 +325,15 @@

(def ^{:added "0.3.16"
:doc
"Return a lazy seq of parsed json objects from [protocol://]jsons[.gz] files.
warning: the seq must be entirely consumed before every files are closed."}
"Returns a lazy seq of parsed json objects from [protocol://]jsons[.zext] files.
Warning: The seq must be entirely consumed before every files are closed."}
read-jsons-files
(read-string-files-fn read-jsons-file))

(def ^{:added "0.3.16"
:doc
"Return a lazy seq of parsed edn objects from [protocol://]edns[.gz] files.
warning: the seq must be entirely consumed before every files are closed."}
"Returns a lazy seq of parsed edn objects from [protocol://]edns[.zext] files.
Warning: The seq must be entirely consumed before every files are closed."}
read-edns-files
(read-string-files-fn read-edns-file))

Expand Down
228 changes: 228 additions & 0 deletions src/tools/io/compress.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
(ns tools.io.compress
(:require
[clojure.string :as str])
(:import
(java.io InputStream OutputStream)
(org.apache.commons.compress.compressors CompressorException CompressorStreamFactory)
(org.apache.commons.compress.compressors.bzip2 BZip2CompressorInputStream BZip2CompressorOutputStream)
(org.apache.commons.compress.compressors.gzip GzipCompressorInputStream GzipCompressorOutputStream GzipParameters)
(org.apache.commons.compress.compressors.lz4 FramedLZ4CompressorInputStream FramedLZ4CompressorOutputStream FramedLZ4CompressorOutputStream$BlockSize FramedLZ4CompressorOutputStream$Parameters)
(org.apache.commons.compress.compressors.xz XZUtils)
(org.apache.commons.compress.compressors.zstandard ZstdUtils)))

(defn ^:no-doc detect?
"Do not expose!
Useful for internal use but depends too much on Common Compress."
[input-stream]
(try
(CompressorStreamFactory/detect input-stream)
(catch CompressorException e
(println (ex-message e)))))

(defprotocol Compressor
"Interface of a Compressor implementation usable with `tools.io`."

(-get-file-extensions
[_this]
"Returns a collection of related file extensions.")
(-get-input-stream ^InputStream
[_this input-stream options]
"Returns a stream which uncompress an input stream.")
(-get-output-stream ^OutputStream
[_this output-stream options]
"Returns a stream which compress an output stream."))

(defonce ^:private ^:no-doc !ext->compressor (atom {}))

(defn register-compressor!
"Registers a new compressor implementation in the global registry."
[compressor]
(if (satisfies? Compressor compressor)
(let [exts (-get-file-extensions compressor)]
(doseq [ext (mapv str/lower-case exts)]
(swap! !ext->compressor assoc ext compressor)))
(throw (ex-info (str "Invalid compressor object:" compressor) {}))))

(defn unregister-compressor!
"Unregisters a compressor from the global registry."
[compressor]
(if (satisfies? Compressor compressor)
(let [exts (-get-file-extensions compressor)]
(doseq [ext (mapv str/lower-case exts)]
(swap! !ext->compressor dissoc ext)))
(throw (ex-info (str "Invalid compressor object:" compressor) {}))))

(defn get-compressor
"Returns the compressor associated with a given file extension or nil."
[file-extension]
(when (string? file-extension)
(get @!ext->compressor (str/lower-case file-extension))))

;;
;; Gzip support
;;

(defn- gzip-opts
[{:keys [compression-level buffer-size]}]
(let [params (GzipParameters.)]
(when compression-level
(.setCompressionLevel params (int compression-level)))
(when buffer-size
(.setBufferSize params (int buffer-size)))
params))

(defrecord GzipCompressor []
Compressor
(-get-file-extensions
[_]
["gz" "gzip"])
(-get-input-stream
[_ input-stream {:keys [concatenated?]}]
(GzipCompressorInputStream. input-stream (boolean concatenated?)))
(-get-output-stream
[_ output-stream opts]
(GzipCompressorOutputStream. output-stream (gzip-opts opts))))

;;
;; Bzip2 support
;;

(defrecord Bzip2Compressor []
Compressor
(-get-file-extensions
[_]
["bz2" "bzip2"])
(-get-input-stream
[_ input-stream {:keys [concatenated?]}]
(BZip2CompressorInputStream. input-stream (boolean concatenated?)))
(-get-output-stream
[_ output-stream {:keys [block-size]}]
(if block-size
(BZip2CompressorOutputStream. output-stream (int block-size))
(BZip2CompressorOutputStream. output-stream))))

;;
;; Framed LZ4 support
;;

(defn- ->lz4-block-size
[s]
(FramedLZ4CompressorOutputStream$BlockSize/valueOf (name s)))

(defn- lz4-opts
[{:keys [block-size]}]
(if block-size
(FramedLZ4CompressorOutputStream$Parameters. (->lz4-block-size block-size))
FramedLZ4CompressorOutputStream$Parameters/DEFAULT))

(defrecord FramedLZ4Compressor []
Compressor
(-get-file-extensions
[_]
["lz4"])
(-get-input-stream
[_ input-stream {:keys [concatenated?]}]
(FramedLZ4CompressorInputStream. input-stream (boolean concatenated?)))
(-get-output-stream
[_ output-stream opts]
(FramedLZ4CompressorOutputStream. output-stream (lz4-opts opts))))

;;
;; Extra provided compression
;;

(defn- ex-compression
[compression]
(let [msg (format "%s compression is not available"
(str/upper-case compression))]
(ex-info msg {:error msg :missing-compression compression})))

;;
;;
;; Zstandard support
;;

(defmacro ^:private when-zstd-provided
[& body]
(if (ZstdUtils/isZstdCompressionAvailable)
`(do ~@body)
`(throw (ex-compression "Zstd"))))

(defmacro ^:private zstd-new
[stream-name & args]
`(new ~(symbol (str "org.apache.commons.compress.compressors.zstandard."
(name stream-name)))
~@args))

(defrecord ZstdCompressor []
Compressor
(-get-file-extensions
[_]
["zst" "zstd"])
(-get-input-stream
[_ input-stream _opts]
(when-zstd-provided
(zstd-new "ZstdCompressorInputStream" input-stream)))
(-get-output-stream
[_ output-stream {:keys [level]}]
(when-zstd-provided
(if level
(zstd-new "ZstdCompressorOutputStream" output-stream (int level))
(zstd-new "ZstdCompressorOutputStream" output-stream)))))

;;
;; XZ support
;;

(defmacro ^:private when-xz-provided
[& body]
(if (XZUtils/isXZCompressionAvailable)
`(do ~@body)
`(throw (ex-compression "XZ"))))

(defmacro ^:private xz-new
[stream-name & args]
`(new ~(symbol (str "org.apache.commons.compress.compressors.xz."
(name stream-name)))
~@args))

(defrecord XZCompressor []
Compressor
(-get-file-extensions
[_]
["xz"])
(-get-input-stream
[_ input-stream {:keys [concatenated?]}]
(when-xz-provided
(xz-new "XZCompressorInputStream" input-stream (boolean concatenated?))))
(-get-output-stream
[_ output-stream {:keys [preset]}]
(when-xz-provided
(if preset
(xz-new "XZCompressorOutputStream" output-stream (int preset))
(xz-new "XZCompressorOutputStream" output-stream)))))

;;
;; Register default compressors
;;

(def ^:private default-compressors
#{(->GzipCompressor)
(->Bzip2Compressor)
(->ZstdCompressor)
(->XZCompressor)
(->FramedLZ4Compressor)})

(defn register-default-compressors!
"Register all default compressors."
[]
(doseq [compressor default-compressors]
(register-compressor! compressor)))

(defn unregister-default-compressors!
"Unregister all default compressors."
[]
(doseq [compressor default-compressors]
(unregister-compressor! compressor)))

(register-default-compressors!)
Loading

0 comments on commit a45ff80

Please sign in to comment.