From 45d98216c26e0f92c4b756940e4f65cac37b7df7 Mon Sep 17 00:00:00 2001
From: Avimitin
Date: Fri, 30 Aug 2024 13:41:42 +0800
Subject: [PATCH] [tests] downgrade tokenizers to 0.13 to fix unhandled
 ge.Scalar Aten IR node

Signed-off-by: Avimitin
---
 nix/pkgs/buddy-mlir.nix    |  24 +++++-
 nix/pkgs/tokenizer-013.nix | 146 +++++++++++++++++++++++++++++++++++++
 2 files changed, 167 insertions(+), 3 deletions(-)
 create mode 100644 nix/pkgs/tokenizer-013.nix

diff --git a/nix/pkgs/buddy-mlir.nix b/nix/pkgs/buddy-mlir.nix
index be44bcaae..8b4fa97be 100644
--- a/nix/pkgs/buddy-mlir.nix
+++ b/nix/pkgs/buddy-mlir.nix
@@ -2,7 +2,8 @@
 , ninja
 , llvmPackages_17
 , fetchFromGitHub
-, fetchpatch
+, fetchurl
+, rustPlatform
 , python3
 , callPackage
 }:
@@ -10,6 +11,23 @@
 let
   stdenv = llvmPackages_17.stdenv;
   bintools = llvmPackages_17.bintools;
+  downgradedPyPkgs = python3.override {
+    packageOverrides = final: prev: {
+      tokenizers = (final.callPackage ./tokenizer-013.nix { });
+
+      transformers = (prev.transformers.overridePythonAttrs (old: rec {
+        version = "4.33.1";
+
+        src = fetchFromGitHub {
+          owner = "huggingface";
+          repo = "transformers";
+          rev = "refs/tags/v${version}";
+          hash = "sha256-Z78I9S8g9WexoX6HhxwbOD0K0awCTzsqW1ZiWObQNw0=";
+        };
+      }));
+    };
+  };
+
   buddy-llvm = callPackage ./buddy-llvm.nix { inherit stdenv python3; };
   self = stdenv.mkDerivation {
     pname = "buddy-mlir";
@@ -49,12 +67,12 @@
     llvm = buddy-llvm;
 
     # Below three fields are black magic that allow site-packages automatically imported with nixpkgs hooks
-    pythonModule = python3;
+    pythonModule = downgradedPyPkgs;
     pythonPath = [ ];
     requiredPythonModules = [ ];
 
     # nix run buddy-mlir.pyenv to start a python with PyTorch/LLVM MLIR/Buddy Frontend support
-    pyenv = python3.withPackages (ps: [
+    pyenv = downgradedPyPkgs.withPackages (ps: [
       self
       ps.torch
 
diff --git a/nix/pkgs/tokenizer-013.nix b/nix/pkgs/tokenizer-013.nix
new file mode 100644
index 000000000..cd14b7b02
--- /dev/null
+++ b/nix/pkgs/tokenizer-013.nix
@@ -0,0 +1,146 @@
+{ lib
+, buildPythonPackage
+, datasets
+, fetchFromGitHub
+, fetchurl
+, numpy
+, openssl
+, pkg-config
+, pytestCheckHook
+, pythonOlder
+, requests
+, rustPlatform
+, setuptools-rust
+}:
+
+let
+  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
+  # about URLs and file names
+  robertaVocab = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
+    sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
+  };
+  robertaMerges = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
+    sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
+  };
+  albertVocab = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
+    sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
+  };
+  bertVocab = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
+    sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
+  };
+  norvigBig = fetchurl {
+    url = "https://norvig.com/big.txt";
+    sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
+  };
+  docPipelineTokenizer = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
+    hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
+  };
+  docQuicktourTokenizer = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
+    hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
+  };
+  openaiVocab = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
+    sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
+  };
+  openaiMerges = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
+    sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
+  };
+  cargoLock = fetchurl {
+    url = "https://raw.githubusercontent.com/NixOS/nixpkgs/f2ca6788911a1a160089a0575bdfa4170c4e4f83/pkgs/development/python-modules/tokenizers/Cargo.lock";
+    hash = "sha256-KPU7eCrir9oemFhfsdmvOcRjaunQncpszwinRg0FQUE=";
+  };
+in
+buildPythonPackage rec {
+  pname = "tokenizers";
+  version = "0.13.3";
+
+  disabled = pythonOlder "3.7";
+
+  src = fetchFromGitHub {
+    owner = "huggingface";
+    repo = pname;
+    rev = "python-v${version}";
+    hash = "sha256-QZG5jmr3vbyQs4mVBjwVDR31O66dUM+p39R0htJ1umk=";
+  };
+
+  postPatch = ''
+    ln -s ${cargoLock} Cargo.lock
+  '';
+
+  RUSTFLAGS = "-A invalid_reference_casting";
+  cargoDeps = rustPlatform.importCargoLock {
+    lockFile = cargoLock;
+  };
+
+  sourceRoot = "source/bindings/python";
+
+  nativeBuildInputs = [
+    pkg-config
+    setuptools-rust
+  ] ++ (with rustPlatform; [
+    cargoSetupHook
+    rust.cargo
+    rust.rustc
+  ]);
+
+  buildInputs = [
+    openssl
+  ];
+
+  propagatedBuildInputs = [
+    numpy
+  ];
+
+  nativeCheckInputs = [
+    datasets
+    pytestCheckHook
+    requests
+  ];
+
+  postUnpack = ''
+    # Add data files for tests, otherwise tests attempt network access
+    mkdir $sourceRoot/tests/data
+    ( cd $sourceRoot/tests/data
+      ln -s ${robertaVocab} roberta-base-vocab.json
+      ln -s ${robertaMerges} roberta-base-merges.txt
+      ln -s ${albertVocab} albert-base-v1-tokenizer.json
+      ln -s ${bertVocab} bert-base-uncased-vocab.txt
+      ln -s ${docPipelineTokenizer} bert-wiki.json
+      ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
+      ln -s ${norvigBig} big.txt
+      ln -s ${openaiVocab} openai-gpt-vocab.json
+      ln -s ${openaiMerges} openai-gpt-merges.txt )
+  '';
+
+  preCheck = ''
+    export HOME=$(mktemp -d);
+  '';
+
+  pythonImportsCheck = [
+    "tokenizers"
+  ];
+
+  disabledTests = [
+    # Downloads data using the datasets module
+    "TestTrainFromIterators"
+    # Those tests require more data
+    "test_from_pretrained"
+    "test_from_pretrained_revision"
+    "test_continuing_prefix_trainer_mistmatch"
+  ];
+
+  meta = with lib; {
+    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
+    homepage = "https://github.com/huggingface/tokenizers";
+    license = licenses.asl20;
+    maintainers = with maintainers; [ ];
+    platforms = platforms.unix;
+  };
+}