[tests] downgrade tokenizer to 0.13 to fix unhandled ge.Scalar Aten IR node

Signed-off-by: Avimitin <[email protected]>
chipsalliance · Aug 30, 2024 · 45d9821 · 45d9821
1 parent b04364f
commit 45d9821
Show file tree

Hide file tree

Showing 2 changed files with 167 additions and 3 deletions.
diff --git a/nix/pkgs/buddy-mlir.nix b/nix/pkgs/buddy-mlir.nix
@@ -2,14 +2,32 @@
 , ninja
 , llvmPackages_17
 , fetchFromGitHub
-, fetchpatch
+, fetchurl
+, rustPlatform
 , python3
 , callPackage
 }:
 let
   stdenv = llvmPackages_17.stdenv;
   bintools = llvmPackages_17.bintools;
 
+  downgradedPyPkgs = python3.override {  # python3 with tokenizers and transformers swapped for pinned versions
+    packageOverrides = final: prev: {
+      tokenizers = (final.callPackage ./tokenizer-013.nix { });  # local derivation replaces the package set's tokenizers
+
+      transformers = (prev.transformers.overridePythonAttrs (old: rec {  # rec: src.rev below interpolates version
+        version = "4.33.1";
+
+        src = fetchFromGitHub {
+          owner = "huggingface";
+          repo = "transformers";
+          rev = "refs/tags/v${version}";  # evaluates to refs/tags/v4.33.1
+          hash = "sha256-Z78I9S8g9WexoX6HhxwbOD0K0awCTzsqW1ZiWObQNw0=";
+        };
+      }));
+    };
+  };
+
   buddy-llvm = callPackage ./buddy-llvm.nix { inherit stdenv python3; };
   self = stdenv.mkDerivation {
     pname = "buddy-mlir";
@@ -49,12 +67,12 @@ let
       llvm = buddy-llvm;
 
       # Below three fields are black magic that allow site-packages automatically imported with nixpkgs hooks
-      pythonModule = python3;
+      pythonModule = downgradedPyPkgs;
       pythonPath = [ ];
       requiredPythonModules = [ ];
 
       # nix run buddy-mlir.pyenv to start a python with PyTorch/LLVM MLIR/Buddy Frontend support
-      pyenv = python3.withPackages (ps: [
+      pyenv = downgradedPyPkgs.withPackages (ps: [
         self
         ps.torch
 

diff --git a/nix/pkgs/tokenizer-013.nix b/nix/pkgs/tokenizer-013.nix
@@ -0,0 +1,146 @@
+{ lib
+, buildPythonPackage
+, datasets
+, fetchFromGitHub
+, fetchurl
+, numpy
+, openssl
+, pkg-config
+, pytestCheckHook
+, pythonOlder
+, requests
+, rustPlatform
+, setuptools-rust
+}:
+
+let
+  # Data files fetched up front for the test suite; postUnpack links them into tests/data.
+  # See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for the URLs and file names.
+  robertaVocab = fetchurl {  # linked as roberta-base-vocab.json
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
+    sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
+  };
+  robertaMerges = fetchurl {  # linked as roberta-base-merges.txt
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
+    sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
+  };
+  albertVocab = fetchurl {  # linked as albert-base-v1-tokenizer.json
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
+    sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
+  };
+  bertVocab = fetchurl {  # linked as bert-base-uncased-vocab.txt
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
+    sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
+  };
+  norvigBig = fetchurl {  # linked as big.txt
+    url = "https://norvig.com/big.txt";
+    sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
+  };
+  docPipelineTokenizer = fetchurl {  # linked as bert-wiki.json (name differs from the URL's tokenizer.json)
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
+    hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
+  };
+  docQuicktourTokenizer = fetchurl {  # linked as tokenizer-wiki.json (name differs from the URL's tokenizer.json)
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
+    hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
+  };
+  openaiVocab = fetchurl {  # linked as openai-gpt-vocab.json
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
+    sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
+  };
+  openaiMerges = fetchurl {  # linked as openai-gpt-merges.txt
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
+    sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
+  };
+  cargoLock = fetchurl {  # Cargo.lock taken from a fixed nixpkgs commit; used by postPatch and cargoDeps
+    url = "https://raw.githubusercontent.com/NixOS/nixpkgs/f2ca6788911a1a160089a0575bdfa4170c4e4f83/pkgs/development/python-modules/tokenizers/Cargo.lock";
+    hash = "sha256-KPU7eCrir9oemFhfsdmvOcRjaunQncpszwinRg0FQUE=";
+  };
+in
+buildPythonPackage rec {  # tokenizers Python bindings at the version pinned below; rec lets src.rev use version
+  pname = "tokenizers";
+  version = "0.13.3";
+
+  disabled = pythonOlder "3.7";  # package is disabled on Python older than 3.7
+
+  src = fetchFromGitHub {
+    owner = "huggingface";
+    repo = pname;
+    rev = "python-v${version}";  # evaluates to python-v0.13.3
+    hash = "sha256-QZG5jmr3vbyQs4mVBjwVDR31O66dUM+p39R0htJ1umk=";
+  };
+
+  postPatch = ''
+    ln -s ${cargoLock} Cargo.lock
+  '';  # postPatch: symlink the fetched lock file as Cargo.lock
+
+  RUSTFLAGS = "-A invalid_reference_casting";  # -A allows this lint; presumably needed with a newer rustc — TODO confirm
+  cargoDeps = rustPlatform.importCargoLock {  # crate dependencies come from the same fetched lock file
+    lockFile = cargoLock;  # NOTE(review): a fetchurl output as lockFile may require import-from-derivation — confirm
+  };
+
+  sourceRoot = "source/bindings/python";  # build from the bindings/python subdirectory of the unpacked source
+
+  nativeBuildInputs = [
+    pkg-config
+    setuptools-rust
+  ] ++ (with rustPlatform; [
+    cargoSetupHook
+    rust.cargo  # NOTE(review): resolves to rustPlatform.rust.cargo; confirm the pinned nixpkgs still provides it
+    rust.rustc  # NOTE(review): same as above for rustPlatform.rust.rustc
+  ]);
+
+  buildInputs = [
+    openssl
+  ];
+
+  propagatedBuildInputs = [
+    numpy
+  ];
+
+  nativeCheckInputs = [
+    datasets
+    pytestCheckHook
+    requests
+  ];
+
+  postUnpack = ''
+    # Add data files for tests, otherwise tests attempt network access
+    mkdir $sourceRoot/tests/data
+    ( cd $sourceRoot/tests/data
+      ln -s ${robertaVocab} roberta-base-vocab.json
+      ln -s ${robertaMerges} roberta-base-merges.txt
+      ln -s ${albertVocab} albert-base-v1-tokenizer.json
+      ln -s ${bertVocab} bert-base-uncased-vocab.txt
+      ln -s ${docPipelineTokenizer} bert-wiki.json
+      ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
+      ln -s ${norvigBig} big.txt
+      ln -s ${openaiVocab} openai-gpt-vocab.json
+      ln -s ${openaiMerges} openai-gpt-merges.txt )
+  '';  # postUnpack: creates tests/data and symlinks each pre-fetched fixture under the name shown
+
+  preCheck = ''
+    export HOME=$(mktemp -d);
+  '';  # preCheck: point HOME at a fresh temporary directory for the test run
+
+  pythonImportsCheck = [
+    "tokenizers"
+  ];
+
+  disabledTests = [
+    # Downloads data through the datasets module
+    "TestTrainFromIterators"
+    # These tests require more data than is provided here
+    "test_from_pretrained"
+    "test_from_pretrained_revision"
+    "test_continuing_prefix_trainer_mistmatch"  # NOTE(review): spelling presumably mirrors the upstream test name — confirm before changing
+  ];
+
+  meta = with lib; {
+    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
+    homepage = "https://github.com/huggingface/tokenizers";
+    license = licenses.asl20;
+    maintainers = with maintainers; [ ];  # no maintainers listed
+    platforms = platforms.unix;
+  };
+}