From e626c1137273b3a293f0549474ee3cd47a1de254 Mon Sep 17 00:00:00 2001 From: Jordan Doyle Date: Sun, 29 Sep 2024 22:11:42 +0400 Subject: [PATCH] Use Helix's tree-sitter grammar registry --- Cargo.lock | 352 +++------------ Cargo.toml | 29 +- flake.lock | 17 + flake.nix | 18 +- src/git.rs | 52 +-- src/main.rs | 5 + src/syntax_highlight.rs | 251 +++++------ tree-sitter-grammar-repository/Cargo.toml | 26 ++ tree-sitter-grammar-repository/build.rs | 498 ++++++++++++++++++++++ tree-sitter-grammar-repository/src/lib.rs | 66 +++ 10 files changed, 824 insertions(+), 490 deletions(-) create mode 100644 tree-sitter-grammar-repository/Cargo.toml create mode 100644 tree-sitter-grammar-repository/build.rs create mode 100644 tree-sitter-grammar-repository/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index fefe4df..a1da4ba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -381,7 +381,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" dependencies = [ "memchr", - "regex-automata 0.4.7", + "regex-automata 0.4.8", "serde", ] @@ -1707,6 +1707,19 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "globset" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15f1ce686646e7f1e19bf7d5533fe443a45dbfb990e00629110797578b42fb19" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata 0.4.8", + "regex-syntax 0.8.5", +] + [[package]] name = "h2" version = "0.4.6" @@ -2301,6 +2314,16 @@ dependencies = [ "libm", ] +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "object" version = "0.36.4" @@ 
-2461,6 +2484,16 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "prettyplease" +version = "0.2.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479cf940fbbb3426c32c5d5176f62ad57549a0bb84773423ba8be9d089f5faba" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro2" version = "1.0.86" @@ -2488,7 +2521,7 @@ dependencies = [ "rand", "rand_chacha", "rand_xorshift", - "regex-syntax 0.8.4", + "regex-syntax 0.8.5", "unarray", ] @@ -2616,14 +2649,14 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.6" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.7", - "regex-syntax 0.8.4", + "regex-automata 0.4.8", + "regex-syntax 0.8.5", ] [[package]] @@ -2637,13 +2670,13 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.4", + "regex-syntax 0.8.5", ] [[package]] @@ -2654,9 +2687,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "regex-syntax" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "rgit" @@ -2702,32 +2735,8 @@ dependencies = [ "tower-service", "tracing", "tracing-subscriber", - "tree-sitter-bash", - "tree-sitter-c", - 
"tree-sitter-c-sharp", - "tree-sitter-cpp", - "tree-sitter-css", - "tree-sitter-elixir", - "tree-sitter-fortran", - "tree-sitter-go", - "tree-sitter-haskell", + "tree-sitter-grammar-repository", "tree-sitter-highlight", - "tree-sitter-html", - "tree-sitter-java", - "tree-sitter-javascript", - "tree-sitter-json", - "tree-sitter-md", - "tree-sitter-ocaml", - "tree-sitter-php", - "tree-sitter-python", - "tree-sitter-regex", - "tree-sitter-ruby", - "tree-sitter-rust", - "tree-sitter-scss", - "tree-sitter-svelte-ng", - "tree-sitter-toml-ng", - "tree-sitter-typescript", - "tree-sitter-yaml", "unix_mode", "uuid", "v_htmlescape", @@ -3067,7 +3076,7 @@ dependencies = [ "once_cell", "onig", "plist", - "regex-syntax 0.8.4", + "regex-syntax 0.8.5", "serde", "serde_derive", "serde_json", @@ -3146,6 +3155,15 @@ dependencies = [ "once_cell", ] +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + [[package]] name = "time" version = "0.3.36" @@ -3460,97 +3478,27 @@ checksum = "20f4cd3642c47a85052a887d86704f4eac272969f61b686bdd3f772122aabaff" dependencies = [ "cc", "regex", - "regex-syntax 0.8.4", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-bash" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3aa5e1c6bd02c0053f3f68edcf5d8866b38a8640584279e30fca88149ce14dda" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-c" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8b3fb515e498e258799a31d78e6603767cd6892770d9e2290ec00af5c3ad80b" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-c-sharp" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"04c0f6d2209a3cd6d0bb9d2934715da15a15710d3c09c7c1ecd4c9804c3ecd10" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-cpp" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d67e862242878d6ee50e1e5814f267ee3eea0168aea2cdbd700ccfb4c74b6d3" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-css" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d0018d6b1692a806f9cddaa1e5616951fd58840c39a0b21401b55ab3df12292" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-elixir" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97bf0efa4be41120018f23305b105ad4dfd3be1b7f302dc4071d0e6c2dec3a32" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-fortran" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d655214a848bfb63dfdc2e7eeef5c3c323807a220b3117a1aef46b2bb95a12" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-go" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf57626e4c9b6d6efaf8a8d5ee1241c5f178ae7bfdf693713ae6a774f01424e" -dependencies = [ - "cc", + "regex-syntax 0.8.5", "tree-sitter-language", ] [[package]] -name = "tree-sitter-haskell" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b92c8a4c4ceaae105621b00624ee8d9029fb23116f400832e4be30d0639d054" +name = "tree-sitter-grammar-repository" +version = "0.0.1" dependencies = [ + "anyhow", "cc", + "globset", + "heck", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "serde", + "serde_json", + "syn", + "threadpool", + "toml", "tree-sitter-language", ] @@ -3566,172 +3514,12 @@ dependencies = [ "tree-sitter", ] -[[package]] -name 
= "tree-sitter-html" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d52d710a3723360ebade986d3f0ae2aa2c3bcfb87bb1cdf60988ec51c81c40d" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-java" -version = "0.23.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b38b26736e6e97421760201f7a91c859f3b0d44382d48ac18aa963828f784ebf" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-javascript" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59e1f62f8babb640b909f30675d1addeb1f17802f2a4d2af287569753b243977" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-json" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86a5d6b3ea17e06e7a34aabeadd68f5866c0d0f9359155d432095f8b751865e4" -dependencies = [ - "cc", - "tree-sitter-language", -] - [[package]] name = "tree-sitter-language" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2545046bd1473dac6c626659cc2567c6c0ff302fc8b84a56c4243378276f7f57" -[[package]] -name = "tree-sitter-md" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f968c22a01010b83fc960455ae729db08dbeb6388617d9113897cb9204b030" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-ocaml" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0534f94f006cf4d4994e964212e91d4626efcaf6769b023d3f17530399a4d6e1" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-php" -version = "0.23.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e0470ddcab3cab948615d50b0395da28e4ab886c0f78363e607cf7f0724cf4a" -dependencies = [ - "cc", - 
"tree-sitter-language", -] - -[[package]] -name = "tree-sitter-python" -version = "0.23.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65661b1a3e24139e2e54207e47d910ab07e28790d78efc7d5dc3a11ce2a110eb" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-regex" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b9a7087b1cf769c96b7e74414947df067fb6135f04d176fd23be08b9396cc0e" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-ruby" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ec5ee842e27791e0adffa0b2a177614de51d2a26e5c7e84d014ed7f097e5ed0" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-rust" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cffbbcb780348fbae8395742ae5b34c1fd794e4085d43aac9f259387f9a84dc8" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-scss" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33909a9ca86390ebbf3461e9949c4bbe2767d2d024b486306d27616641d4ba24" -dependencies = [ - "cc", - "tree-sitter", -] - -[[package]] -name = "tree-sitter-svelte-ng" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef0a71f9cf5e94373cc86c64893630c8a29bb25d3390a248268d08af2165fa37" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-toml-ng" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "695d20cd83acf16c02c773f03e76d7b43b19883d4e2ce3652a8f06b5e0da7455" -dependencies = [ - "cc", - "tree-sitter", -] - -[[package]] -name = "tree-sitter-typescript" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"aecf1585ae2a9dddc2b1d4c0e2140b2ec9876e2a25fd79de47fcf7dae0384685" -dependencies = [ - "cc", - "tree-sitter-language", -] - -[[package]] -name = "tree-sitter-yaml" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aad27ec46ad343d8b514f64dd3fdffb478c592ece561b6c935d90ef55589c6b6" -dependencies = [ - "cc", - "tree-sitter", -] - [[package]] name = "trim-in-place" version = "0.1.7" diff --git a/Cargo.toml b/Cargo.toml index f578351..a997f3e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,9 @@ edition = "2021" authors = ["Jordan Doyle "] license = "WTFPL" +[workspace] +members = ["tree-sitter-grammar-repository"] + # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] @@ -55,32 +58,8 @@ tower-layer = "0.3" tower-service = "0.3" tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } -tree-sitter-bash = "0.23" -tree-sitter-c = "0.23" -tree-sitter-cpp = "0.23" -tree-sitter-c-sharp = "0.23" -tree-sitter-elixir = "0.3" -tree-sitter-go = "0.23" -tree-sitter-php = "0.23" -tree-sitter-json = "0.23" -tree-sitter-ocaml = "0.23" -tree-sitter-python = "0.23" -tree-sitter-regex = "0.23" -tree-sitter-ruby = "0.23" -tree-sitter-css = "0.23" -tree-sitter-fortran = "0.1" -tree-sitter-haskell = "0.23" +tree-sitter-grammar-repository = { path = "./tree-sitter-grammar-repository" } tree-sitter-highlight = "0.23" -tree-sitter-html = "0.23" -tree-sitter-java = "0.23" -tree-sitter-javascript = "0.23" -tree-sitter-md = "0.3" -tree-sitter-rust = "0.23" -tree-sitter-scss = "1.0" -tree-sitter-svelte-ng = "1.0" -tree-sitter-toml-ng = "0.6.0" -tree-sitter-typescript = "0.23" -tree-sitter-yaml = "0.6" unix_mode = "0.1" uuid = { version = "1.7", features = ["v4"] } v_htmlescape = { version = "0.15", features = ["bytes-buf"] } diff --git a/flake.lock b/flake.lock index 4041d5d..3c29400 100644 --- a/flake.lock +++ b/flake.lock @@ -31,6 +31,22 @@ "type": "github" } 
}, + "helix": { + "flake": false, + "locked": { + "lastModified": 1727613050, + "narHash": "sha256-vxf/5aCNjy0OKzkkkNoeUnjr1lWQDmcKW+UXKpU4weE=", + "owner": "helix-editor", + "repo": "helix", + "rev": "2ce4c6d5fa3e50464b41a3d0190ad0e5ada2fc3c", + "type": "github" + }, + "original": { + "owner": "helix-editor", + "repo": "helix", + "type": "github" + } + }, "nixpkgs": { "locked": { "lastModified": 1727335715, @@ -65,6 +81,7 @@ "inputs": { "advisory-db": "advisory-db", "crane": "crane", + "helix": "helix", "nixpkgs": "nixpkgs", "treefmt-nix": "treefmt-nix", "utils": "utils" diff --git a/flake.nix b/flake.nix index 0303b38..5153365 100644 --- a/flake.nix +++ b/flake.nix @@ -8,14 +8,28 @@ url = "github:rustsec/advisory-db"; flake = false; }; + + helix = { + url = "github:helix-editor/helix"; + flake = false; + }; }; - outputs = { self, nixpkgs, utils, crane, advisory-db, treefmt-nix }: + outputs = { self, nixpkgs, utils, crane, advisory-db, treefmt-nix, helix }: utils.lib.eachDefaultSystem (system: let pkgs = import nixpkgs { inherit system; }; craneLib = crane.mkLib pkgs; src = craneLib.cleanCargoSource ./.; + helix-grammar = pkgs.callPackage "${helix}/grammars.nix" { inherit pkgs; }; + rgit-grammar = pkgs.runCommand "consolidated-rgit-grammars" { } '' + mkdir -p $out + for file in ${helix-grammar}/*; do + ln -s "$file" "$out" + done + ln -s "${helix}/languages.toml" "$out/languages.toml" + ln -s "${helix}/runtime/queries" "$out/queries" + ''; commonArgs = { inherit src; strictDeps = true; @@ -23,6 +37,7 @@ nativeBuildInputs = with pkgs; [ cmake clang ]; LIBCLANG_PATH = "${pkgs.clang.cc.lib}/lib"; ROCKSDB_LIB_DIR = "${pkgs.rocksdb}/lib"; + TREE_SITTER_GRAMMAR_LIB_DIR = "${rgit-grammar}"; }; cargoArtifacts = craneLib.buildDepsOnly commonArgs; rgit = craneLib.buildPackage (commonArgs // { @@ -33,6 +48,7 @@ fileset = pkgs.lib.fileset.unions [ ./Cargo.toml ./Cargo.lock + ./tree-sitter-grammar-repository ./src ./statics ./templates diff --git a/src/git.rs b/src/git.rs 
index 67eda1b..1ab96a7 100644 --- a/src/git.rs +++ b/src/git.rs @@ -1,3 +1,15 @@ +use std::{ + borrow::Cow, + collections::{BTreeMap, VecDeque}, + ffi::OsStr, + fmt::{self, Arguments, Write}, + io::ErrorKind, + path::{Path, PathBuf}, + str::FromStr, + sync::Arc, + time::Duration, +}; + use anyhow::{anyhow, Context, Result}; use axum::response::IntoResponse; use bytes::{buf::Writer, BufMut, Bytes, BytesMut}; @@ -7,8 +19,7 @@ use gix::{ actor::SignatureRef, bstr::{BStr, BString, ByteSlice, ByteVec}, diff::blob::{platform::prepare_diff::Operation, Sink}, - object::tree::EntryKind, - object::Kind, + object::{tree::EntryKind, Kind}, objs::tree::EntryRef, prelude::TreeEntryRefExt, traverse::tree::visit::Action, @@ -16,17 +27,6 @@ use gix::{ ObjectId, ThreadSafeRepository, Url, }; use moka::future::Cache; -use std::{ - borrow::Cow, - collections::{BTreeMap, VecDeque}, - ffi::OsStr, - fmt::{self, Arguments, Write}, - io::ErrorKind, - path::{Path, PathBuf}, - str::FromStr, - sync::Arc, - time::Duration, -}; use tar::Builder; use time::{OffsetDateTime, UtcOffset}; use tracing::{error, instrument, warn}; @@ -144,21 +144,15 @@ impl OpenRepository { match object.kind { Kind::Blob => { - let path = path.join(item.filename().to_path_lossy()); let mut blob = object.into_blob(); let size = blob.data.len(); - let extension = path - .extension() - .or_else(|| path.file_name()) - .and_then(OsStr::to_str) - .unwrap_or_default(); let content = match (formatted, simdutf8::basic::from_utf8(&blob.data)) { (true, Err(_)) => Content::Binary(vec![]), (true, Ok(data)) => Content::Text(Cow::Owned(format_file( data, - FileIdentifier::Extension(extension), + FileIdentifier::Path(path.as_path()), )?)), (false, Err(_)) => Content::Binary(blob.take_data()), (false, Ok(_data)) => Content::Text(Cow::Owned(unsafe { @@ -1091,29 +1085,17 @@ impl Callback for PlainDiffFormatter { } struct SyntaxHighlightedDiffFormatter<'a> { - extension: &'a str, + path: &'a Path, } impl<'a> 
SyntaxHighlightedDiffFormatter<'a> { fn new(path: &'a Path) -> Self { - let extension = path - .extension() - .or_else(|| path.file_name()) - .and_then(OsStr::to_str) - .unwrap_or_default(); - - Self { extension } + Self { path } } fn write(&self, output: &mut String, class: &str, data: &str) { write!(output, r#""#).unwrap(); - format_file_inner( - output, - data, - FileIdentifier::Extension(self.extension), - false, - ) - .unwrap(); + format_file_inner(output, data, FileIdentifier::Path(self.path), false).unwrap(); write!(output, r#""#).unwrap(); } } diff --git a/src/main.rs b/src/main.rs index d33fa01..e55d0d1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -42,6 +42,7 @@ use crate::{ }, git::Git, layers::logger::LoggingMiddleware, + syntax_highlight::prime_highlighters, theme::Theme, }; @@ -190,6 +191,10 @@ async fn main() -> Result<(), anyhow::Error> { } }; + info!("Priming highlighters..."); + prime_highlighters(); + info!("Server starting up..."); + let app = Router::new() .route("/", get(methods::index::handle)) .route( diff --git a/src/syntax_highlight.rs b/src/syntax_highlight.rs index d75713f..3874df5 100644 --- a/src/syntax_highlight.rs +++ b/src/syntax_highlight.rs @@ -3,11 +3,13 @@ use std::{ collections::HashMap, fmt::Write as FmtWrite, io::{ErrorKind, Write as IoWrite}, + path::Path, sync::LazyLock, }; use comrak::adapters::SyntaxHighlighterAdapter; use tracing::debug; +use tree_sitter_grammar_repository::{Grammar, Language}; use tree_sitter_highlight::{HighlightConfiguration, HighlightEvent, Highlighter}; thread_local! { @@ -33,152 +35,95 @@ macro_rules! define_classes { } define_classes! 
{ - "keyword.directive" => "keyword directive", - "markup.strikethrough" => "markup strikethrough", - "markup.link" => "markup link", - "keyword.control.conditional" => "keyword control conditional", - "markup.bold" => "markup bold", - "diff.plus" => "diff plus", - "markup.heading.2" => "markup heading 2", - "markup" => "markup", - "diff.delta" => "diff delta", - "variable.other.member" => "variable other member", - "namespace" => "namespace", - "comment.line" => "comment line", - "function" => "function", - "keyword.operator" => "keyword operator", - "punctuation.bracket" => "punctuation bracket", - "markup.list" => "markup list", - "type.builtin" => "type builtin", - "keyword.storage.modifier" => "keyword storage modifier", - "constant" => "constant", - "markup.italic" => "markup italic", - "variable" => "variable", - "keyword" => "keyword", - "punctuation.special" => "punctuation special", - "string.special.path" => "string special path", - "keyword.storage.type" => "keyword storage type", - "markup.heading.5" => "markup heading 5", - "markup.heading.6" => "markup heading 6", - "markup.link.label" => "markup link label", - "markup.list.numbered" => "markup list numbered", - "diff.delta.moved" => "diff delta moved", - "constant.numeric" => "constant numeric", - "markup.heading" => "markup heading", - "markup.link.text" => "markup link text", - "keyword.function" => "keyword function", - "string.special.url" => "string special url", - "keyword.control.return" => "keyword control return", - "keyword.control.repeat" => "keyword control repeat", - "constant.builtin" => "constant builtin", - "type.enum.variant" => "type enum variant", - "markup.raw.block" => "markup raw block", - "markup.heading.3" => "markup heading 3", - "escape" => "escape", - "comment.block" => "comment block", - "constant.numeric.integer" => "constant numeric integer", - "punctuation.delimiter" => "punctuation delimiter", - "constructor" => "constructor", - "type" => "type", - "string.regexp" => 
"string regexp", - "variable.parameter" => "variable parameter", - "markup.quote" => "markup quote", - "string.special" => "string special", - "constant.numeric.float" => "constant numeric float", - "constant.character.escape" => "constant character escape", - "tag" => "tag", - "keyword.storage" => "keyword storage", - "string" => "string", - "function.macro" => "function macro", - "markup.list.unnumbered" => "markup list unnumbered", - "diff.minus" => "diff minus", - "punctuation" => "punctuation", - "markup.link.url" => "markup link url", - "function.method" => "function method", - "markup.raw" => "markup raw", - "function.special" => "function special", - "attribute" => "attribute", - "operator" => "operator", - "special" => "special", - "function.builtin" => "function builtin", - "diff" => "diff", - "markup.heading.4" => "markup heading 4", - "keyword.control" => "keyword control", - "markup.list.unchecked" => "markup list unchecked", - "keyword.control.exception" => "keyword control exception", - "constant.builtin.boolean" => "constant builtin boolean", - "markup.heading.1" => "markup heading 1", - "markup.heading.marker" => "markup heading marker", - "constant.character" => "constant character", - "markup.raw.inline" => "markup raw inline", - "variable.builtin" => "variable builtin", - "variable.other" => "variable other", - "tag.builtin" => "tag builtin", - "type.enum" => "type enum", - "comment.block.documentation" => "comment block documentation", - "comment" => "comment", - "string.special.symbol" => "string special symbol", - "label" => "label", - "keyword.control.import" => "keyword control import", - "markup.list.checked" => "markup list checked", +"attribute" => "attribute", +"boolean" => "boolean", +"carriage-return" => "carriage-return", +"comment" => "comment", +"comment.documentation" => "comment documentation", +"constant" => "constant", +"constant.builtin" => "constant builtin", +"constructor" => "constructor", +"constructor.builtin" => 
"constructor builtin", +"embedded" => "embedded", +"error" => "error", +"escape" => "escape", +"function" => "function", +"function.builtin" => "function builtin", +"keyword" => "keyword", +"markup" => "markup", +"markup.bold" => "markup bold", +"markup.heading" => "markup heading", +"markup.italic" => "markup italic", +"markup.link" => "markup link", +"markup.link.url" => "markup link url", +"markup.list" => "markup list", +"markup.list.checked" => "markup list checked", +"markup.list.numbered" => "markup list numbered", +"markup.list.unchecked" => "markup list unchecked", +"markup.list.unnumbered" => "markup list unnumbered", +"markup.quote" => "markup quote", +"markup.raw" => "markup raw", +"markup.raw.block" => "markup raw block", +"markup.raw.inline" => "markup raw inline", +"markup.strikethrough" => "markup strikethrough", +"module" => "module", +"number" => "number", +"operator" => "operator", +"property" => "property", +"property.builtin" => "property builtin", +"punctuation" => "punctuation", +"punctuation.bracket" => "punctuation bracket", +"punctuation.delimiter" => "punctuation delimiter", +"punctuation.special" => "punctuation special", +"string" => "string", +"string.escape" => "string escape", +"string.regexp" => "string regexp", +"string.special" => "string special", +"string.special.symbol" => "string special symbol", +"tag" => "tag", +"type" => "type", +"type.builtin" => "type builtin", +"variable" => "variable", +"variable.builtin" => "variable builtin", +"variable.member" => "variable member", +"variable.parameter" => "variable parameter",} + +pub fn prime_highlighters() { + let _res = HIGHLIGHTER_CONFIGS.len(); } -macro_rules! 
build_highlighter_configs { - ($($i:literal => $($extension:literal)|* => $($token:literal)|* => $config:expr),*,) => { - static BUILD_HIGHLIGHTER_CONFIGS: LazyLock<[HighlightConfiguration; count!($($config),*)]> = LazyLock::new(|| [ - $({ - let mut config = $config.unwrap(); - config.configure(&HIGHLIGHT_NAMES); - config - }),* - ]); - - pub fn fetch_highlighter_config(extension: &str) -> Option<&'static HighlightConfiguration> { - match extension { - $($($extension)|* => Some(&BUILD_HIGHLIGHTER_CONFIGS[$i])),*, - _ => None, - } - } +static HIGHLIGHTER_CONFIGS: LazyLock> = LazyLock::new(|| { + Grammar::VARIANTS + .iter() + .copied() + .map(Grammar::highlight_configuration_params) + .map(|v| { + let mut configuration = HighlightConfiguration::new( + v.language.into(), + v.name, + v.highlights_query, + v.injection_query, + v.locals_query, + ) + .unwrap_or_else(|e| panic!("bad query for {}: {e}", v.name)); + configuration.configure(&HIGHLIGHT_NAMES); + configuration + }) + .collect() +}); - pub fn fetch_highlighter_config_by_token(extension: &str) -> Option<&'static HighlightConfiguration> { - match extension { - $($($token)|* => Some(&BUILD_HIGHLIGHTER_CONFIGS[$i])),*, - _ => None, - } - } - }; +pub fn fetch_highlighter_config(file: &Path) -> Option<&'static HighlightConfiguration> { + Language::from_file_name(file) + .map(Language::grammar) + .map(Grammar::idx) + .map(|idx| &HIGHLIGHTER_CONFIGS[idx]) } -build_highlighter_configs! 
{ - // # extensions name/aliases - 0 => "java" => "java" => HighlightConfiguration::new(tree_sitter_java::LANGUAGE.into(), "java", tree_sitter_java::HIGHLIGHTS_QUERY, "", ""), - 1 => "html" => "html" => HighlightConfiguration::new(tree_sitter_html::LANGUAGE.into(), "html", include_str!("../grammar/html/highlights.scm"), include_str!("../grammar/html/injections.scm"), ""), - 2 => "md" => "markdown" => HighlightConfiguration::new(tree_sitter_md::LANGUAGE.into(), "markdown", tree_sitter_md::HIGHLIGHT_QUERY_BLOCK, tree_sitter_md::INJECTION_QUERY_BLOCK, ""), - 3 => "rs" => "rust" => HighlightConfiguration::new(tree_sitter_rust::LANGUAGE.into(), "rust", tree_sitter_rust::HIGHLIGHTS_QUERY, tree_sitter_rust::INJECTIONS_QUERY, ""), - 4 => "toml" => "toml" => HighlightConfiguration::new(tree_sitter_toml_ng::language(), "toml", tree_sitter_toml_ng::HIGHLIGHTS_QUERY, "", ""), - 5 => "yaml" | "yml" => "yaml" | "yml" => HighlightConfiguration::new(tree_sitter_yaml::language(), "yaml", tree_sitter_yaml::HIGHLIGHTS_QUERY, "", ""), - 6 => "hs" => "haskell" => HighlightConfiguration::new(tree_sitter_haskell::LANGUAGE.into(), "haskell", tree_sitter_haskell::HIGHLIGHTS_QUERY, tree_sitter_haskell::INJECTIONS_QUERY, tree_sitter_haskell::LOCALS_QUERY), - 7 => "f" | "f90" | "for" => "fortran" => HighlightConfiguration::new(tree_sitter_fortran::LANGUAGE.into(), "fortran", include_str!("../grammar/fortran/highlights.scm"), "", ""), - 8 => "svelte" => "svelte" => HighlightConfiguration::new(tree_sitter_svelte_ng::LANGUAGE.into(), "svelte", tree_sitter_svelte_ng::HIGHLIGHTS_QUERY, tree_sitter_svelte_ng::INJECTIONS_QUERY, tree_sitter_svelte_ng::LOCALS_QUERY), - 9 => "js" => "js" | "javascript" => HighlightConfiguration::new(tree_sitter_javascript::LANGUAGE.into(), "javascript", tree_sitter_javascript::HIGHLIGHT_QUERY, tree_sitter_javascript::INJECTIONS_QUERY, tree_sitter_javascript::LOCALS_QUERY), - 10 => "jsx" => "jsx" => HighlightConfiguration::new(tree_sitter_javascript::LANGUAGE.into(), 
"jsx", tree_sitter_javascript::JSX_HIGHLIGHT_QUERY, tree_sitter_javascript::INJECTIONS_QUERY, tree_sitter_javascript::LOCALS_QUERY), - 11 => "ts" => "ts" | "typescript" => HighlightConfiguration::new(tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(), "typescript", tree_sitter_typescript::HIGHLIGHTS_QUERY, "", ""), - 12 => "tsx" => "tsx" => HighlightConfiguration::new(tree_sitter_typescript::LANGUAGE_TSX.into(), "tsx", tree_sitter_typescript::HIGHLIGHTS_QUERY, "", ""), - 13 => "scss" => "scss" => HighlightConfiguration::new(tree_sitter_scss::language(), "scss", tree_sitter_scss::HIGHLIGHTS_QUERY, "", ""), - 14 => "css" => "css" => HighlightConfiguration::new(tree_sitter_css::LANGUAGE.into(), "css", tree_sitter_css::HIGHLIGHTS_QUERY, "", ""), - 15 => "bash" | "sh" => "bash" | "shell" | "sh" => HighlightConfiguration::new(tree_sitter_bash::LANGUAGE.into(), "css", tree_sitter_bash::HIGHLIGHT_QUERY, "", ""), - 16 => "c" => "c" => HighlightConfiguration::new(tree_sitter_c::LANGUAGE.into(), "c", tree_sitter_c::HIGHLIGHT_QUERY, "", ""), - 17 => "cpp" | "c++" => "cpp" | "c++" => HighlightConfiguration::new(tree_sitter_cpp::LANGUAGE.into(), "c++", tree_sitter_cpp::HIGHLIGHT_QUERY, "", ""), - 18 => "cs" => "c#" | "cs" | "csharp" => HighlightConfiguration::new(tree_sitter_c_sharp::LANGUAGE.into(), "c#", tree_sitter_c_sharp::HIGHLIGHTS_QUERY, "", ""), - 19 => "ex" | "exs" => "elixir" => HighlightConfiguration::new(tree_sitter_elixir::LANGUAGE.into(), "elixir", tree_sitter_elixir::HIGHLIGHTS_QUERY, tree_sitter_elixir::INJECTIONS_QUERY, ""), - 21 => "go" => "go" | "golang" => HighlightConfiguration::new(tree_sitter_go::LANGUAGE.into(), "go", tree_sitter_go::HIGHLIGHTS_QUERY, "", ""), - 22 => "php" => "php" => HighlightConfiguration::new(tree_sitter_php::LANGUAGE_PHP.into(), "php", tree_sitter_php::HIGHLIGHTS_QUERY, tree_sitter_php::INJECTIONS_QUERY, ""), - 23 => "json" => "json" => HighlightConfiguration::new(tree_sitter_json::LANGUAGE.into(), "json", 
tree_sitter_json::HIGHLIGHTS_QUERY, "", ""), - 24 => "ml" => "ml" | "ocaml" => HighlightConfiguration::new(tree_sitter_ocaml::LANGUAGE_OCAML.into(), "ocaml", tree_sitter_ocaml::HIGHLIGHTS_QUERY, "", tree_sitter_ocaml::LOCALS_QUERY), - 25 => "mli" => "mli" | "ocaml-interface" => HighlightConfiguration::new(tree_sitter_ocaml::LANGUAGE_OCAML_INTERFACE.into(), "ocaml", tree_sitter_ocaml::HIGHLIGHTS_QUERY, "", tree_sitter_ocaml::LOCALS_QUERY), - 26 => "py" => "py" | "python" => HighlightConfiguration::new(tree_sitter_python::LANGUAGE.into(), "python", tree_sitter_python::HIGHLIGHTS_QUERY, "", ""), - 27 => "regex" => "regex" => HighlightConfiguration::new(tree_sitter_regex::LANGUAGE.into(), "regex", tree_sitter_regex::HIGHLIGHTS_QUERY, "", ""), - 28 => "rb" => "rb" | "ruby" => HighlightConfiguration::new(tree_sitter_ruby::LANGUAGE.into(), "ruby", tree_sitter_ruby::HIGHLIGHTS_QUERY, "", tree_sitter_ruby::LOCALS_QUERY), +pub fn fetch_highlighter_config_by_token(token: &str) -> Option<&'static HighlightConfiguration> { + Language::from_injection(token) + .map(Language::grammar) + .map(Grammar::idx) + .map(|idx| &HIGHLIGHTER_CONFIGS[idx]) } pub struct ComrakHighlightAdapter; @@ -212,9 +157,9 @@ impl SyntaxHighlighterAdapter for ComrakHighlightAdapter { } } -#[derive(Copy, Clone)] +#[derive(Copy, Clone, Debug)] pub enum FileIdentifier<'a> { - Extension(&'a str), + Path(&'a Path), Token(&'a str), } @@ -231,10 +176,22 @@ pub fn format_file_inner( code_tag: bool, ) -> anyhow::Result<()> { let config = match identifier { - FileIdentifier::Extension(v) => fetch_highlighter_config(v), + FileIdentifier::Path(v) => fetch_highlighter_config(v), FileIdentifier::Token(v) => fetch_highlighter_config_by_token(v), }; + if let Some(config) = config { + eprintln!("------------------------------------------------------------"); + eprintln!("{}", config.language_name); + eprintln!("{:?}", config.query); + eprintln!("{:?}", config.names()); + 
eprintln!("------------------------------------------------------------"); + } else { + eprintln!("------------------------------------------------------------"); + eprintln!("{identifier:?} DID NOT MATCH ANYTHING"); + eprintln!("------------------------------------------------------------"); + } + let line_prefix = if code_tag { "" } else { "" }; let line_suffix = if code_tag { "\n" } else { "\n" }; @@ -250,9 +207,9 @@ pub fn format_file_inner( }; HIGHLIGHTER.with_borrow_mut(|highlighter| { - let mut spans = highlighter.highlight(config, content.as_bytes(), None, |extension| { - debug!(extension, "Highlighter switch requested"); - fetch_highlighter_config(extension).or(fetch_highlighter_config_by_token(extension)) + let mut spans = highlighter.highlight(config, content.as_bytes(), None, |injection| { + debug!(injection, "Highlighter switch requested"); + fetch_highlighter_config_by_token(injection) })?; let mut tag_open = true; diff --git a/tree-sitter-grammar-repository/Cargo.toml b/tree-sitter-grammar-repository/Cargo.toml new file mode 100644 index 0000000..7b8d9a3 --- /dev/null +++ b/tree-sitter-grammar-repository/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "tree-sitter-grammar-repository" +description = "tree-sitter grammars built from Helix with support for dynamic linking" +version = "0.0.1" +edition = "2021" +authors = ["Jordan Doyle "] +license = "WTFPL" + +[dependencies] +globset = "0.4" +regex = "1.11" +tree-sitter-language = "0.1" + +[build-dependencies] +anyhow = "1.0" +cc = "1.1" +serde = { version = "1.0", features = ["derive"] } +toml = "0.8" +threadpool = "1.8" +quote = "1.0" +proc-macro2 = "1.0" +prettyplease = "0.2" +heck = "0.5" +syn = "2.0" +serde_json = "1.0" +regex = "1.11" diff --git a/tree-sitter-grammar-repository/build.rs b/tree-sitter-grammar-repository/build.rs new file mode 100644 index 0000000..883f763 --- /dev/null +++ b/tree-sitter-grammar-repository/build.rs @@ -0,0 +1,498 @@ +use std::{ + borrow::Cow, + ffi::OsStr, + 
fmt::Write,
+    fs,
+    path::{Path, PathBuf},
+    process::Command,
+    sync::LazyLock,
+};
+
+use anyhow::{bail, Context};
+use heck::{ToSnakeCase, ToUpperCamelCase};
+use quote::{format_ident, quote};
+use serde::Deserialize;
+use threadpool::ThreadPool;
+
+/// Repository the language/grammar configuration and queries are fetched from.
+const GRAMMAR_REPOSITORY_URL: &str = "https://github.com/helix-editor/helix";
+/// Commit of [`GRAMMAR_REPOSITORY_URL`] that is checked out.
+const GRAMMAR_REPOSITORY_REF: &str = "82dd96369302f60a9c83a2d54d021458f82bcd36";
+/// File name of the language/grammar configuration inside the repository root.
+const GRAMMAR_REPOSITORY_CONFIG_PATH: &str = "languages.toml";
+
+/// Names that are skipped everywhere in this build script.
+static BLACKLISTED_MODULES: &[&str] = &[
+    // these languages all don't have corresponding grammars
+    "cabal",
+    "idris",
+    "llvm-mir-yaml",
+    "prolog",
+    "mint",
+];
+
+/// Build-script entry point.
+///
+/// When `TREE_SITTER_GRAMMAR_LIB_DIR` is set, the configuration and queries are read from that
+/// directory and every grammar is linked as a dynamic library. Otherwise the Helix repository is
+/// fetched into `OUT_DIR/sources` and the grammars are fetched and compiled from source.
+fn main() -> anyhow::Result<()> {
+    let out_dir = PathBuf::from(std::env::var("OUT_DIR").context("OUT_DIR not set by rustc")?);
+
+    let root = std::env::var("TREE_SITTER_GRAMMAR_LIB_DIR").ok();
+
+    let (root, dylib) = if let Some(root) = root.as_deref() {
+        (Path::new(root), true)
+    } else {
+        (out_dir.as_path(), false)
+    };
+
+    let (config, query_path) = if dylib {
+        let config: HelixLanguages = toml::from_str(
+            &fs::read_to_string(root.join(GRAMMAR_REPOSITORY_CONFIG_PATH))
+                .context("failed to read languages.toml")?,
+        )
+        .context("failed to parse helix languages.toml")?;
+
+        // Cargo only interprets build-script directives written to stdout; anything on stderr is
+        // merely captured as log output, so these must be `println!`.
+        println!("cargo:rustc-link-search=native={}", root.display());
+
+        for grammar in &config.grammar {
+            // Blacklisted names get no generated module, so there is nothing to link for them.
+            if BLACKLISTED_MODULES.contains(&grammar.name.as_str()) {
+                continue;
+            }
+
+            println!("cargo:rustc-link-lib=dylib={}", grammar.name);
+        }
+
+        (config, root.join("queries"))
+    } else {
+        let sources = out_dir.join("sources");
+        fs::create_dir_all(&sources)?;
+
+        let helix_root = sources.join("helix");
+
+        fetch_git_repository(GRAMMAR_REPOSITORY_URL, GRAMMAR_REPOSITORY_REF, &helix_root)
+            .context(GRAMMAR_REPOSITORY_URL)?;
+
+        let config: HelixLanguages = toml::from_str(
+            &fs::read_to_string(helix_root.join(GRAMMAR_REPOSITORY_CONFIG_PATH))
+                .context("failed to read helix languages.toml")?,
+        )
+        .context("failed to parse helix languages.toml")?;
+
+        fetch_and_build_grammar(config.grammar.clone(), &sources)?;
+
+        (config,
helix_root.join("runtime/queries"))
+    };
+
+    let mut grammar_defs = Vec::new();
+    for grammar in &config.grammar {
+        let name = &grammar.name;
+        if let Some(tokens) =
+            build_language_module(name, query_path.as_path()).with_context(|| name.to_string())?
+        {
+            grammar_defs.push(tokens);
+        }
+    }
+    fs::write(
+        &out_dir.join("grammar.defs.rs"),
+        prettyplease::unparse(
+            &syn::parse2(quote!(#(#grammar_defs)*)).context("failed to parse grammar defs")?,
+        ),
+    )
+    .context("failed to write grammar defs")?;
+
+    let registry = build_grammar_registry(config.grammar.iter().map(|v| v.name.clone()));
+    fs::write(
+        &out_dir.join("grammar.registry.rs"),
+        prettyplease::unparse(&syn::parse2(registry).context("failed to parse grammar registry")?),
+    )
+    .context("failed to write grammar registry")?;
+
+    let language = build_language_registry(config.language)?;
+    fs::write(
+        &out_dir.join("language.registry.rs"),
+        prettyplease::unparse(&syn::parse2(language)?),
+    )?;
+
+    Ok(())
+}
+
+/// Generates the `Language` enum together with its lookup functions.
+///
+/// Every non-blacklisted entry of `language_definition` becomes one variant (upper camel case of
+/// its name). The generated `impl` provides:
+///
+/// * `VARIANTS` - all variants in definition order,
+/// * `grammar()` - the `Grammar` variant for the language (its `grammar` key, falling back to
+///   the language name),
+/// * `from_file_name()` - lookup through a `GlobSet` built from the language file types, where a
+///   bare extension `ext` is turned into the glob `*.ext`,
+/// * `from_injection()` - lookup through a `RegexSet` built from the injection regexes.
+///
+/// When several patterns match, both lookups keep the match whose pattern *source text* is
+/// shortest.
+// NOTE(review): preferring the shortest pattern is what the generated code has always done, but
+// the accumulator is called `max`; confirm that "shortest" rather than "longest / most specific"
+// is really the intended tie-break.
+fn build_language_registry(
+    language_definition: Vec<LanguageDefinition>,
+) -> anyhow::Result<proc_macro2::TokenStream> {
+    let mut camel = Vec::new();
+    let mut grammars = Vec::new();
+
+    let mut globs = Vec::new();
+    let mut globs_to_camel = Vec::new();
+
+    let mut injection_regex = Vec::new();
+    let mut injection_regex_str_len = Vec::new();
+    // Variant owning each injection regex, index-aligned with `injection_regex`.
+    let mut injection_to_camel = Vec::new();
+
+    for language in &language_definition {
+        if BLACKLISTED_MODULES.contains(&language.name.as_str()) {
+            continue;
+        }
+
+        let camel_cased_name = format_ident!("{}", language.name.to_upper_camel_case());
+        camel.push(camel_cased_name.clone());
+
+        let grammar = language
+            .grammar
+            .as_deref()
+            .unwrap_or(language.name.as_str());
+        grammars.push(format_ident!("{}", grammar.to_upper_camel_case()));
+
+        for ty in &language.file_types {
+            match ty {
+                FileType::Glob { glob } => globs.push(Cow::Borrowed(glob)),
+                FileType::Extension(ext) => globs.push(Cow::Owned(format!("*.{ext}"))),
+            }
+
+            globs_to_camel.push(camel_cased_name.clone());
+        }
+
+        if let Some(regex) = language.injection_regex.as_deref() {
+            injection_regex.push(regex);
+            injection_regex_str_len.push(regex.len());
+            injection_to_camel.push(camel_cased_name.clone());
+        }
+    }
+
+    let injection_regex_len = injection_regex.len();
+    let globs_array_len = globs.len();
+    let globs_string_len = globs.iter().map(|v| v.len()).collect::<Vec<_>>();
+
+    Ok(quote! {
+        #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+        pub enum Language {
+            #(#camel),*
+        }
+
+        impl Language {
+            pub const VARIANTS: &[Self] = &[
+                #(Self::#camel),*
+            ];
+
+            pub const fn grammar(self) -> Grammar {
+                match self {
+                    #(Self::#camel => Grammar::#grammars),*
+                }
+            }
+
+            pub fn from_file_name<P: AsRef<::std::path::Path>>(name: P) -> Option<Self> {
+                const LENGTHS: [usize; #globs_array_len] = [#(#globs_string_len),*];
+                const GLOB_TO_VARIANT: [Language; #globs_array_len] = [#(Language::#globs_to_camel),*];
+
+                thread_local! {
+                    static GLOB: ::std::cell::LazyCell<::globset::GlobSet> = ::std::cell::LazyCell::new(|| {
+                        ::globset::GlobSetBuilder::new()
+                            #(.add(::globset::Glob::new(#globs).unwrap()))*
+                            .build()
+                            .unwrap()
+                    });
+                }
+
+                let mut max = usize::MAX;
+                let mut curr = None;
+
+                GLOB.with(|glob| {
+                    for m in glob.matches(name) {
+                        let curr_length = LENGTHS[m];
+
+                        if curr_length < max {
+                            max = curr_length;
+                            curr = Some(GLOB_TO_VARIANT[m]);
+                        }
+                    }
+                });
+
+                curr
+            }
+
+            pub fn from_injection(name: &str) -> Option<Self> {
+                const LENGTHS: [usize; #injection_regex_len] = [#(#injection_regex_str_len),*];
+                // A `RegexSet` match yields the index of the *pattern*. Languages without an
+                // injection regex contribute no pattern, so that index cannot be used on
+                // `VARIANTS` directly and has to go through this table.
+                const INJECTION_TO_VARIANT: [Language; #injection_regex_len] = [#(Language::#injection_to_camel),*];
+
+                thread_local! {
+                    static REGEX: ::std::cell::LazyCell<::regex::RegexSet> = ::std::cell::LazyCell::new(|| {
+                        ::regex::RegexSet::new(&[
+                            #(#injection_regex),*
+                        ])
+                        .unwrap()
+                    });
+                }
+
+                let mut max = usize::MAX;
+                let mut curr = None;
+
+                REGEX.with(|regex| {
+                    for m in regex.matches(name) {
+                        let curr_length = LENGTHS[m];
+
+                        if curr_length < max {
+                            max = curr_length;
+                            curr = Some(INJECTION_TO_VARIANT[m]);
+                        }
+                    }
+                });
+
+                curr
+            }
+        }
+    })
+}
+
+/// Generates the `Grammar` enum from the grammar `names`, skipping blacklisted ones.
+///
+/// Each variant is the upper-camel-cased grammar name. `highlight_configuration_params()` points
+/// at the constants of the matching `crate::grammar::<snake_case_name>` module and `idx()` returns
+/// the position of the variant among the non-blacklisted grammars.
+fn build_grammar_registry(names: impl Iterator<Item = String>) -> proc_macro2::TokenStream {
+    let (ids, plain, camel, snake) = names
+        .filter(|name| !BLACKLISTED_MODULES.contains(&name.as_str()))
+        .enumerate()
+        .fold(
+            (Vec::new(), Vec::new(), Vec::new(), Vec::new()),
+            |(mut ids, mut plain_acc, mut camel_acc, mut snake_acc), (i, name)| {
+                camel_acc.push(format_ident!("{}", name.to_upper_camel_case()));
+                snake_acc.push(format_ident!("{}", name.to_snake_case()));
+                plain_acc.push(name);
+                ids.push(i);
+                (ids, plain_acc, camel_acc, snake_acc)
+            },
+        );
+
+    quote! {
+        #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+        pub enum Grammar {
+            #(#camel),*
+        }
+
+        impl Grammar {
+            pub const VARIANTS: &[Self] = &[
+                #(Self::#camel),*
+            ];
+
+            pub const fn highlight_configuration_params(self) -> crate::HighlightConfigurationParams {
+                match self {
+                    #(Self::#camel => crate::HighlightConfigurationParams {
+                        language: crate::grammar::#snake::LANGUAGE,
+                        name: #plain,
+                        highlights_query: crate::grammar::#snake::HIGHLIGHTS_QUERY,
+                        injection_query: crate::grammar::#snake::INJECTIONS_QUERY,
+                        locals_query: crate::grammar::#snake::LOCALS_QUERY,
+                    }),*
+                }
+            }
+
+            pub const fn idx(self) -> usize {
+                match self {
+                    #(Self::#camel => #ids),*
+                }
+            }
+        }
+    }
+}
+
+/// Generates the `pub mod <name>` holding the FFI language function and the query constants for
+/// one grammar, or `None` when `name` is blacklisted.
+fn build_language_module(
+    name: &str,
+    query_path: &Path,
+) -> anyhow::Result<Option<proc_macro2::TokenStream>> {
+    if BLACKLISTED_MODULES.contains(&name) {
+        return Ok(None);
+    }
+
+    let highlights_query = read_local_query(query_path, name, "highlights.scm");
+    let injections_query = read_local_query(query_path,
name, "injections.scm");
+    let locals_query = read_local_query(query_path, name, "locals.scm");
+
+    let name = format_ident!("{}", name.to_snake_case());
+    let ffi = format_ident!("tree_sitter_{name}");
+
+    Ok(Some(quote! {
+        pub mod #name {
+            extern "C" {
+                fn #ffi() -> *const ();
+            }
+
+            pub const LANGUAGE: tree_sitter_language::LanguageFn = unsafe { tree_sitter_language::LanguageFn::from_raw(#ffi) };
+            pub const HIGHLIGHTS_QUERY: &str = #highlights_query;
+            pub const INJECTIONS_QUERY: &str = #injections_query;
+            pub const LOCALS_QUERY: &str = #locals_query;
+        }
+    }))
+}
+
+/// Reads `query_path/<language>/<filename>` and expands every `; inherits: a,b` directive in it
+/// with the (recursively expanded) query of the same `filename` for each listed language.
+///
+/// Returns an empty string when the file does not exist.
+///
+/// # Panics
+///
+/// Panics when the file exists but cannot be read.
+// taken from https://github.com/helix-editor/helix/blob/2ce4c6d5fa3e50464b41a3d0190ad0e5ada2fc3c/helix-core/src/syntax.rs#L721
+fn read_local_query(query_path: &Path, language: &str, filename: &str) -> String {
+    static INHERITS_REGEX: LazyLock<regex::Regex> =
+        LazyLock::new(|| regex::Regex::new(r";+\s*inherits\s*:?\s*([a-z_,()-]+)\s*").unwrap());
+
+    let path = query_path.join(language).join(filename);
+
+    if !path.exists() {
+        return String::new();
+    }
+
+    let query =
+        fs::read_to_string(&path).unwrap_or_else(|e| panic!("failed to fetch {path:?}: {e:?}"));
+
+    INHERITS_REGEX
+        .replace_all(&query, |captures: &regex::Captures| {
+            captures[1]
+                .split(',')
+                .fold(String::new(), |mut output, language| {
+                    // `write!` to a String cannot fail.
+                    write!(
+                        output,
+                        "\n{}\n",
+                        read_local_query(query_path, language, filename)
+                    )
+                    .unwrap();
+                    output
+                })
+        })
+        .into_owned()
+}
+
+/// Fetches (for git sources) and compiles the parser and scanner of every non-blacklisted
+/// grammar, one thread-pool job per grammar.
+///
+/// # Errors
+///
+/// Fails when the available parallelism cannot be queried, or when any job panicked (failed
+/// fetch or failed compilation).
+fn fetch_and_build_grammar(
+    grammars: Vec<GrammarDefinition>,
+    source_dir: &Path,
+) -> anyhow::Result<()> {
+    let pool = ThreadPool::new(std::thread::available_parallelism()?.get());
+
+    for grammar in grammars {
+        if BLACKLISTED_MODULES.contains(&grammar.name.as_str()) {
+            continue;
+        }
+
+        let mut grammar_root = source_dir.join(&grammar.name);
+
+        pool.execute(move || {
+            let grammar_root = match grammar.source {
+                GrammarSource::Git {
+                    remote,
+                    revision,
+                    subpath,
+                } => {
+                    // Annotate with the grammar's own remote so the failing repository can be
+                    // identified from the panic message.
+                    fetch_git_repository(&remote, &revision, &grammar_root)
+                        .with_context(|| remote.clone())
+                        .expect("failed to fetch git repository");
+
+                    if let Some(subpath) = subpath {
+                        grammar_root.push(subpath);
+                    }
+
+                    grammar_root
+                }
+                GrammarSource::Local { path } => path,
+            };
+
+            let grammar_src = grammar_root.join("src");
+
+            // Prefer the C source, fall back to the C++ one, otherwise skip the component.
+            let parser_file = Some(grammar_src.join("parser.c"))
+                .filter(|s| s.exists())
+                .or_else(|| Some(grammar_src.join("parser.cc")))
+                .filter(|s| s.exists());
+            let scanner_file = Some(grammar_src.join("scanner.c"))
+                .filter(|s| s.exists())
+                .or_else(|| Some(grammar_src.join("scanner.cc")))
+                .filter(|s| s.exists());
+
+            if let Some(parser_file) = parser_file {
+                cc::Build::new()
+                    .cpp(parser_file.extension() == Some(OsStr::new("cc")))
+                    .file(parser_file)
+                    .flag_if_supported("-w")
+                    .flag_if_supported("-s")
+                    .include(&grammar_src)
+                    .compile(&format!("{}-parser", grammar.name));
+            }
+
+            if let Some(scanner_file) = scanner_file {
+                cc::Build::new()
+                    .cpp(scanner_file.extension() == Some(OsStr::new("cc")))
+                    .file(scanner_file)
+                    .flag_if_supported("-w")
+                    .flag_if_supported("-s")
+                    .include(&grammar_src)
+                    .compile(&format!("{}-scanner", grammar.name));
+            }
+        });
+    }
+
+    pool.join();
+
+    // A panicking job only takes down its worker thread; without this check a failed fetch or
+    // compile would go unnoticed here and only surface later as an unresolved symbol.
+    let failed = pool.panic_count();
+    if failed > 0 {
+        bail!("{failed} grammar build job(s) panicked");
+    }
+
+    Ok(())
+}
+
+/// Makes sure `destination` is a git checkout of `url` at `ref_`.
+///
+/// Initialises the repository (with `url` as `origin`) when `destination` does not exist yet,
+/// returns early when `HEAD` already equals `ref_`, and otherwise shallow-fetches `ref_` and
+/// hard-resets to it.
+///
+/// The early return compares `ref_` with the output of `git rev-parse HEAD`, so it only triggers
+/// when `ref_` is a full commit hash.
+///
+/// # Errors
+///
+/// Fails when `git` cannot be spawned or when `init`, `remote add`, `fetch` or `reset` exits
+/// unsuccessfully.
+fn fetch_git_repository(url: &str, ref_: &str, destination: &Path) -> anyhow::Result<()> {
+    if !destination.exists() {
+        let res = Command::new("git").arg("init").arg(destination).status()?;
+        if !res.success() {
+            bail!("git init failed with exit code {res}");
+        }
+
+        let res = Command::new("git")
+            .args(&["remote", "add", "origin", url])
+            .current_dir(destination)
+            .status()?;
+        if !res.success() {
+            bail!("git remote failed with exit code {res}");
+        }
+    }
+
+    let head = Command::new("git")
+        .args(&["rev-parse", "HEAD"])
+        .current_dir(destination)
+        .output()?;
+    // `rev-parse` terminates its output with a newline, which has to be trimmed before
+    // comparing; an unsuccessful run (e.g. a repository without commits) never counts as a match.
+    if head.status.success() && head.stdout.trim_ascii() == ref_.as_bytes() {
+        return Ok(());
+    }
+
+    let res = Command::new("git")
+        .args(&["fetch", "--depth", "1", "origin", ref_])
+        .current_dir(destination)
+        .status()?;
+    if !res.success() {
+        bail!("git fetch failed with exit code {res}");
+    }
+
+    let res = Command::new("git")
+        .args(&["reset", "--hard", ref_])
+        .current_dir(destination)
+        .status()?;
+    if !res.success() {
+        bail!("git reset failed with exit code {res}");
+    }
+
+    Ok(())
+}
+
+/// One `[[language]]` entry of `languages.toml` (only the keys this script uses).
+#[derive(Deserialize)]
+#[serde(rename_all = "kebab-case")]
+struct LanguageDefinition {
+    name: String,
+    injection_regex: Option<String>,
+    file_types: Vec<FileType>,
+    grammar: Option<String>,
+}
+
+/// A file type of a language: either an explicit glob table or a bare extension string.
+#[derive(Deserialize)]
+#[serde(untagged)]
+pub enum FileType {
+    Glob { glob: String },
+    Extension(String),
+}
+
+/// One `[[grammar]]` entry of `languages.toml`.
+#[derive(Deserialize, Clone)]
+#[serde(rename_all = "kebab-case")]
+pub struct GrammarDefinition {
+    name: String,
+    source: GrammarSource,
+}
+
+/// Where the sources of a grammar come from.
+#[derive(Deserialize, Clone)]
+#[serde(rename_all = "lowercase", untagged)]
+enum GrammarSource {
+    Git {
+        #[serde(rename = "git")]
+        remote: String,
+        #[serde(rename = "rev")]
+        revision: String,
+        subpath: Option<PathBuf>,
+    },
+    Local {
+        path: PathBuf,
+    },
+}
+
+/// Top level of `languages.toml`.
+#[derive(Deserialize)]
+#[serde(rename_all = "kebab-case")]
+struct HelixLanguages {
+    language: Vec<LanguageDefinition>,
+    grammar: Vec<GrammarDefinition>,
+}
diff --git a/tree-sitter-grammar-repository/src/lib.rs b/tree-sitter-grammar-repository/src/lib.rs
new file mode 100644
index 0000000..ab1047a
--- /dev/null
+++ b/tree-sitter-grammar-repository/src/lib.rs
@@ -0,0 +1,66 @@
+//!
# tree-sitter-grammar-repository +//! +//! This crate loads in all known languages and grammars from `helix`'s +//! `languages.toml` at compile time and provides an easy way to map a +//! language to a highlighter configuration. +//! +//! `tree-sitter` grammars can be dynamically linked by setting the +//! `TREE_SITTER_GRAMMAR_LIB_DIR` environment variable. If set, this library +//! expects a directory of the format below. NOTE(review): `build.rs` reads `languages.toml` and `queries/<grammar>/*.scm` directly under this directory and links one dylib per grammar name; confirm the tree below matches. +//! +//! ```text +//! - TREE_SITTER_GRAMMAR_LIB_DIR +//! - sources/ +//! - html/ +//! - queries/ +//! - highlights.scm +//! - injections.scm +//! - package.json +//! - javascript/ +//! - queries/ +//! - highlights.scm +//! - injections.scm +//! - package.json +//! - libhtml-parser.so +//! - libhtml-scanner.so +//! - libjavascript-scanner.so +//! - ... +//! ``` +//! +//! Usage: +//! +//! ```ignore +//! use std::collections::HashMap; +//! use tree_sitter_grammar_repository::{Grammar, Language}; +//! use tree_sitter_highlight::HighlightConfiguration; +//! +//! let highlighter_configurations = Grammar::VARIANTS +//! .iter() +//! .copied() +//! .map(|grammar| (grammar, grammar.highlight_configuration_params())) +//! .map(|(grammar, v)| (grammar, HighlightConfiguration::new( +//! v.language.into(), +//! v.name, +//! v.highlights_query, +//! v.injection_query, +//! v.locals_query +//! ))) +//! .collect::<HashMap<_, _>>(); +//! +//! let highlighter_configuration = Language::from_file_name("hello_world.toml") +//! .and_then(|language| highlighter_configurations.get(&language.grammar())); +//! ``` + +include!(concat!(env!("OUT_DIR"), "/grammar.registry.rs")); +include!(concat!(env!("OUT_DIR"), "/language.registry.rs")); +pub mod grammar { + include!(concat!(env!("OUT_DIR"), "/grammar.defs.rs")); +} + +pub struct HighlightConfigurationParams { + pub language: tree_sitter_language::LanguageFn, + pub name: &'static str, + pub highlights_query: &'static str, + pub injection_query: &'static str, + pub locals_query: &'static str, +}